#!/bin/bash

{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
| 16 | |
# Abort on the first failing command (errexit) and trace each command as it
# is executed (xtrace).
set -o errexit -o xtrace
| 18 | |
#######################################
# Block until a regular file exists at the given path, polling once a second.
# There is no timeout: the call only returns once the file is present.
# Arguments:
#   $1 - path of the file to wait for
# Returns:
#   0 once the file exists; 1 (message on stderr) when no path was given
#######################################
wait_for_file() {
  local file=${1:-}

  # An empty path can never show up on disk, so report it instead of either
  # silently skipping the wait or polling forever.
  if [ -z "${file}" ]; then
    echo "ERROR: wait_for_file called without a file path" 1>&2
    return 1
  fi

  # Quoted so that a path containing whitespace or glob characters is tested
  # as one literal path instead of breaking the `[` expression.
  while [ ! -f "${file}" ]; do
    sleep 1
  done
}
okozachenko1203 | 14e3801 | 2023-09-01 01:40:19 +1000 | [diff] [blame] | 26 | |
# Hold start-up until every certificate/key file below is present on disk.
# The list is walked in order: CA certificates, certificates, private keys,
# then the libvirt-vnc set. The first entry of each of the first three groups
# comes from the chart values.
for pki_path in \
  {{ .Values.conf.libvirt.ca_file }} \
  /etc/pki/qemu/ca-cert.pem \
  {{ .Values.conf.libvirt.cert_file }} \
  /etc/pki/libvirt/clientcert.pem \
  /etc/pki/qemu/server-cert.pem \
  /etc/pki/qemu/client-cert.pem \
  {{ .Values.conf.libvirt.key_file }} \
  /etc/pki/libvirt/private/clientkey.pem \
  /etc/pki/qemu/server-key.pem \
  /etc/pki/qemu/client-key.pem \
  /etc/pki/libvirt-vnc/ca-cert.pem \
  /etc/pki/libvirt-vnc/server-cert.pem \
  /etc/pki/libvirt-vnc/server-key.pem
do
  wait_for_file "${pki_path}"
done
Mohammed Naser | f3bd28e | 2023-07-10 16:14:28 -0400 | [diff] [blame] | 43 | |
# TODO: We disable cgroup functionality for cgroup v2, we should fix this in the future
# Classify the host by the filesystem type reported for /sys/fs/cgroup/:
# "cgroup2fs" selects v2, anything else (including a failed stat) selects v1.
# The pipeline is the condition itself; wrapping it in $(...) would instead
# try to execute its (empty) output as a command.
if stat -fc %T /sys/fs/cgroup/ | grep -q cgroup2fs; then
  CGROUP_VERSION=v2
else
  CGROUP_VERSION=v1
fi
| 50 | |
# If any process on the host is named exactly "libvirtd", report it on stderr
# and SIGKILL it.
# The cheap grep first decides whether a per-process scan is needed at all.
# xtrace is switched off while walking /proc (one iteration per process would
# flood the log) and re-enabled only around the lines acting on a match.
if grep -qsw libvirtd /proc/*/comm; then
  set +x
  # Glob directly instead of parsing `ls` output.
  for comm_file in /proc/*/comm; do
    # A process may exit mid-scan; skip entries that can no longer be read
    # (the guard also keeps errexit from aborting the script here).
    proc_name=$(cat "${comm_file}" 2>/dev/null) || continue
    if [ "${proc_name}" == "libvirtd" ]; then
      set -x
      # /proc/<pid>/comm -> <pid>
      libvirtpid=${comm_file#/proc/}
      libvirtpid=${libvirtpid%%/*}
      echo "WARNING: libvirtd daemon already running on host" 1>&2
      # Best effort: the status file is gone if the process already exited.
      grep State "/proc/${libvirtpid}/status" 1>&2 2>/dev/null || true
      # Failure is ignored for the same reason.
      kill -9 "$libvirtpid" || true
      set +x
    fi
  done
  set -x
fi
| 65 | |
# Drop any pid file left behind by a previous libvirtd; later in this script
# the presence of this file is what signals that libvirtd has started.
rm -f /var/run/libvirtd.pid

# When the KVM character device is present, make it read/write for owner and
# group only and hand it to root:kvm.
kvm_device=/dev/kvm
if [ -c "${kvm_device}" ]; then
  chmod 660 "${kvm_device}"
  chown root:kvm "${kvm_device}"
fi
| 72 | |
if [[ "${CGROUP_VERSION}" != "v2" ]]; then
  # Set up cgroups to use when breaking out of the Kubernetes-defined groups.
  # CGROUPS becomes a comma-terminated list of those controllers (out of cpu,
  # rdma, hugetlb) that have a directory under /sys/fs/cgroup; the trailing
  # comma is stripped wherever the list is used.
  CGROUPS=""
  for CGROUP in cpu rdma hugetlb; do
    if [[ -d "/sys/fs/cgroup/${CGROUP}" ]]; then
      CGROUPS+="${CGROUP},"
    fi
  done
  cgcreate -g "${CGROUPS%,}:/osh-libvirt"
fi
Mohammed Naser | f3f59a7 | 2023-01-15 21:02:04 -0500 | [diff] [blame] | 83 | |
# We assume that if hugepage count > 0, then hugepages should be exposed to libvirt/qemu
hp_count="$(grep HugePages_Total /proc/meminfo | tr -cd '[:digit:]')"
# The leading 0 keeps the numeric test well-formed when no count was read.
if [ 0"$hp_count" -gt 0 ]; then

  echo "INFO: Detected hugepage count of '$hp_count'. Enabling hugepage settings for libvirt/qemu."

  # Enable KVM hugepages for QEMU: flip an existing KVM_HUGEPAGES=0 line,
  # otherwise append the setting.
  if grep -q KVM_HUGEPAGES=0 /etc/default/qemu-kvm; then
    sed -i 's/.*KVM_HUGEPAGES=0.*/KVM_HUGEPAGES=1/g' /etc/default/qemu-kvm
  else
    echo KVM_HUGEPAGES=1 >> /etc/default/qemu-kvm
  fi

  # Ensure that the hugepage mount location is available/mapped inside the
  # container. This assumes use of the default ubuntu dev-hugepages.mount
  # systemd unit which mounts hugepages at this location.
  if [ ! -d /dev/hugepages ]; then
    echo "ERROR: Hugepages configured in kernel, but libvirtd container cannot access /dev/hugepages"
    exit 1
  fi

  if [[ "${CGROUP_VERSION}" != "v2" ]]; then
    # Kubernetes 1.10.x introduced cgroup changes that caused the container's
    # hugepage byte limit quota to zero out. This workaround sets that pod limit
    # back to the total number of hugepage bytes available to the baremetal host.
    if [ -d /sys/fs/cgroup/hugetlb ]; then
      # Expand the limit files with a glob instead of parsing `ls`; when nothing
      # matches, the pattern stays literal and the existence check below fails.
      limits=(/sys/fs/cgroup/hugetlb/{{ .Values.conf.kubernetes.cgroup }}/hugetlb.*.limit_in_bytes)
      if [[ ! -e "${limits[0]}" ]]; then
        echo "ERROR: Failed to locate any hugetable limits. Did you set the correct cgroup in your values used for this chart?"
        exit 1
      fi

      # Parent directory of the hugetlb cgroup this process is in, as listed in
      # /proc/self/cgroup. It is the same for every limit file, so compute once.
      pod_cgroup_dir="$(dirname "$(awk -F: '($2~/hugetlb/){print $3}' /proc/self/cgroup)")"

      for limit in "${limits[@]}"; do
        target="/sys/fs/cgroup/hugetlb/${pod_cgroup_dir}/$(basename "$limit")"
        # Ensure the write target for the hugepage limit for the pod exists;
        # stop here rather than attempting a write that cannot succeed.
        if [ ! -f "$target" ]; then
          echo "ERROR: Could not find write target for hugepage limit: $target"
          exit 1
        fi

        # Write hugetable limit for pod
        cat "$limit" > "$target"
      done
    fi

    # Determine OS default hugepage size to use for the hugepage write test
    default_hp_kb="$(grep Hugepagesize /proc/meminfo | tr -cd '[:digit:]')"

    # Attempt to write to the hugepage mount to ensure it is operational, but only
    # if we have at least 1 free page.
    # cat is kept in a pipeline on purpose: the pipeline status is tr's, so a
    # missing sysfs file yields an empty count instead of tripping errexit.
    num_free_pages="$(cat "/sys/kernel/mm/hugepages/hugepages-${default_hp_kb}kB/free_hugepages" | tr -cd '[:digit:]')"
    echo "INFO: '$num_free_pages' free hugepages of size ${default_hp_kb}kB"
    if [ 0"$num_free_pages" -gt 0 ]; then
      # NOTE(review): the length handed to fallocate is the kB figure without a
      # unit suffix, while the message below reports kB -- confirm intent.
      if ! { fallocate -o0 -l "$default_hp_kb" /dev/hugepages/foo && rm /dev/hugepages/foo; }; then
        echo "ERROR: fallocate failed test at /dev/hugepages with size ${default_hp_kb}kB"
        rm -f /dev/hugepages/foo
        exit 1
      fi
    fi
  fi
fi
| 140 | |
okozachenko1203 | d2cd687 | 2023-08-31 19:53:51 +1000 | [diff] [blame] | 141 | if [ -n "${LIBVIRT_CEPH_CINDER_SECRET_UUID}" ] || [ -n "${LIBVIRT_EXTERNAL_CEPH_CINDER_SECRET_UUID}" ] ; then |
# Start libvirtd in the background so the remainder of this branch can talk to
# it while it runs.
if [[ "${CGROUP_VERSION}" != "v2" ]]; then
  #NOTE(portdirect): run libvirtd as a transient unit on the host with the osh-libvirt cgroups applied.
  cgexec -g "${CGROUPS%,}:/osh-libvirt" systemd-run --scope --slice=system libvirtd --listen &
else
  systemd-run --scope --slice=system libvirtd --listen &
fi
Mohammed Naser | f3f59a7 | 2023-01-15 21:02:04 -0500 | [diff] [blame] | 148 | |
# Scratch XML file for the secret definition; a second one is allocated when
# an external Ceph secret UUID is configured.
tmpsecret=$(mktemp --suffix .xml)
[ -z "${LIBVIRT_EXTERNAL_CEPH_CINDER_SECRET_UUID}" ] || tmpsecret2=$(mktemp --suffix .xml)

# Remove the scratch file(s) created above. Installed as the EXIT handler so
# it runs on every exit path of the script.
cleanup() {
  rm -f "${tmpsecret}"
  [ -z "${LIBVIRT_EXTERNAL_CEPH_CINDER_SECRET_UUID}" ] || rm -f "${tmpsecret2}"
}
trap cleanup EXIT
| 160 | |
# Wait for libvirtd to come up: poll once a second for its pid file.
TIMEOUT=60
while [[ ! -f /var/run/libvirtd.pid ]]; do
  if [[ ${TIMEOUT} -gt 0 ]]; then
    # Arithmetic assignment rather than `let`: `let` returns a non-zero status
    # when the result is 0, which under errexit would abort the script on the
    # last tick, before the error message below can be printed.
    TIMEOUT=$((TIMEOUT - 1))
    sleep 1
  else
    echo "ERROR: libvirt did not start in time (pid file missing)"
    exit 1
  fi
done

# Even though we see the pid file, the socket may not be there immediately
# (it is needed for virsh), so poll for it as well.
TIMEOUT=10
while [[ ! -e /var/run/libvirt/libvirt-sock ]]; do
  if [[ ${TIMEOUT} -gt 0 ]]; then
    TIMEOUT=$((TIMEOUT - 1))
    sleep 1
  else
    echo "ERROR: libvirt did not start in time (socket missing)"
    exit 1
  fi
done
| 185 | |
#######################################
# Define a libvirt secret of usage type 'ceph' and store its value.
# Writes the secret XML to ${tmpsecret}, then calls `virsh secret-define` and
# `virsh secret-set-value`.
# Globals:
#   tmpsecret (read) - path of the scratch XML file to overwrite
# Arguments:
#   $1 - Ceph client user name (used in the secret's usage name)
#   $2 - UUID of the secret
#   $3 - secret value, passed to virsh as --base64
# Returns:
#   status of the last virsh call
# NOTE(review): when xtrace is active the secret value shows up in the trace
# output of the virsh call -- confirm that this is acceptable.
# NOTE(review): the blank inside "client.<user>. secret" is kept exactly as it
# was -- confirm whether it is intended.
#######################################
create_virsh_libvirt_secret() {
  # Locals, so repeated calls do not leak state into the rest of the script.
  local sec_user=$1
  local sec_uuid=$2
  local sec_ceph_keyring=$3

  cat > "${tmpsecret}" <<EOF
<secret ephemeral='no' private='no'>
  <uuid>${sec_uuid}</uuid>
  <usage type='ceph'>
    <name>client.${sec_user}. secret</name>
  </usage>
</secret>
EOF
  virsh secret-define --file "${tmpsecret}"
  virsh secret-set-value --secret "${sec_uuid}" --base64 "${sec_ceph_keyring}"
}
| 201 | |
# When no key was supplied through CEPH_CINDER_KEYRING, take it from the
# "key" line of the user's keyring file under /etc/ceph.
if [ -z "${CEPH_CINDER_KEYRING}" ] && [ -n "${CEPH_CINDER_USER}" ]; then
  CEPH_CINDER_KEYRING=$(awk '/key/{print $3}' "/etc/ceph/ceph.client.${CEPH_CINDER_USER}.keyring")
fi
# Arguments are quoted so that an empty value stays in its own position
# instead of shifting the following arguments to the left.
if [ -n "${CEPH_CINDER_USER}" ]; then
  create_virsh_libvirt_secret "${CEPH_CINDER_USER}" "${LIBVIRT_CEPH_CINDER_SECRET_UUID}" "${CEPH_CINDER_KEYRING}"
fi

# Second secret for the external Ceph cluster; its key is read from a file.
if [ -n "${LIBVIRT_EXTERNAL_CEPH_CINDER_SECRET_UUID}" ]; then
  EXTERNAL_CEPH_CINDER_KEYRING=$(cat /tmp/external-ceph-client-keyring)
  create_virsh_libvirt_secret "${EXTERNAL_CEPH_CINDER_USER}" "${LIBVIRT_EXTERNAL_CEPH_CINDER_SECRET_UUID}" "${EXTERNAL_CEPH_CINDER_KEYRING}"
fi
| 213 | |
cleanup

# stop libvirtd; we needed it up to create secrets
LIBVIRTD_PID=$(< /var/run/libvirtd.pid)
kill "${LIBVIRTD_PID}"
# `tail --pid` terminates once that process is gone, so this line blocks until
# libvirtd has actually exited.
tail --pid="${LIBVIRTD_PID}" -f /dev/null
| 220 | |
| 221 | fi |
| 222 | |
# Run libvirtd in the foreground; this is the last command of the script.
if [[ "${CGROUP_VERSION}" != "v2" ]]; then
  #NOTE(portdirect): run libvirtd as a transient unit on the host with the osh-libvirt cgroups applied.
  cgexec -g "${CGROUPS%,}:/osh-libvirt" systemd-run --scope --slice=system libvirtd --listen
else
  systemd-run --scope --slice=system libvirtd --listen
fi