Dynamic CPU isolation in QEMU/KVM
#!/usr/bin/env bash
#
# CPU isolation in QEMU/KVM
#
# As the cset scripts no longer work (systemd switched to cgroups v2), this is my
# attempt at emulating their functionality. It may be incorrect and/or it may break
# stuff. What follows is mostly copy-pasted from the references below, with some
# reasoning added.
#
# Host:
# - CPU: AMD Ryzen 9 3950X
# - OS: Arch Linux
# - VM: 5 x (4 CPUs, 16GB RAM)
#
# From https://www.reddit.com/r/VFIO/comments/ebe3l5/deprecated_isolcpus_workaround :
#echo "+cpuset"|tee /sys/fs/cgroup/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/init.scope/cgroup.subtree_control
#
# https://www.reddit.com/r/VFIO/comments/mihb5j/systemd248_breaks_vm_boot_libvirt/
# https://www.reddit.com/r/VFIO/comments/j1a5jm/cpu_pinning_explaination/
# https://www.reddit.com/r/VFIO/comments/ij25rg/splitting_ht_cores_between_host_and_vm/
# CPU frequency scaling tools (see wiki)
# cpupower frequency-info
# turbostat
# https://documentation.suse.com/sle-rt/15-SP1/pdf/art-slert-virtguide_color_en.pdf
# Disable interrupt balancing (irqbalance). The irqbalance daemon is enabled by default
# and it distributes hardware interrupts across CPUs in a multi-core system to increase
# performance. When irqbalance is disabled, all interrupts will be handled by cpu0, and
# therefore the guest should NOT run on cpu0. - how to do it?
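# One possible answer (untested sketch): stop or mask the service, e.g.
#   systemctl disable --now irqbalance
# or, to keep irqbalance running but hands-off for the VM CPUs, set the
# IRQBALANCE_BANNED_CPUS hex mask in its environment file (location varies by
# distribution) so that it never touches the isolated CPUs.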
#
[[ 1 = 2 ]] && echo -e "\n
+---------------------------+
| +----------+ |
| | CCD0 | |
| | +------+ | |
| | | CCX0 | | |
| +-------+ | +------+ | |
| | | | | CCX1 | | |
| | | | +------+ | |
| | I/O | +----------+ |
| | | +----------+ |
| | | | CCD1 | |
| +-------+ | +------+ | |
| | | CCX2 | | |
| | +------+ | |
| | | CCX3 | | |
| | +------+ | |
| +----------+ |
+---------------------------+
\n"
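# To check how the kernel's CPU numbering maps onto the cores/CCXs sketched above
# (and therefore which threads are SMT siblings and should stay on the same side of
# the host/VM split), something like the following can be used:
#   lscpu -e
#   cat /sys/devices/system/cpu/cpu0/topology/thread_siblings_list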
VM_NAME=vm1
VM_HUGEPAGES=8192
CPU_ALL="0-31"
CPU_ALL_MASK=ffffffff
CPU_HOST="0-7,16-19"
CPU_HOST_MASK=000f00ff    # hex mask for CPU_HOST (bits 0-7 and 16-19)
CPU_VM="8-15,20-31"       # everything not in CPU_HOST: 20 CPUs for 5 x 4-vCPU VMs
HV="qemu-system-x86_64"
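# Helper (sketch, not called anywhere below): derive a hex affinity mask from a CPU
# list such as "0-7,16-19", mainly to keep the *_MASK values above consistent with
# their CPU lists.
cpulist_to_mask() {
    local list=$1 mask=0 range lo hi cpu ranges
    IFS=',' read -ra ranges <<< "$list"
    for range in "${ranges[@]}"; do
        lo=${range%-*}; hi=${range#*-}   # a single CPU yields lo == hi
        for ((cpu = lo; cpu <= hi; cpu++)); do
            mask=$((mask | (1 << cpu)))
        done
    done
    printf '%x\n' "$mask"
}
# e.g. cpulist_to_mask "$CPU_HOST"   # -> f00ff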
NEED=("haproxy" "zfs-share" "nfs-server" "libvirtd")
for n in "${NEED[@]}"; do
    [[ "$(systemctl is-active ${n})" != "active" ]] && systemctl start ${n}
    [[ "$(systemctl is-active ${n})" != "active" ]] && echo "failed to start ${n}" && exit 1
    echo "${n} is up"
done
echo -e "\n\033[3mWait for \"\033[1mDomain $VM_NAME started\033[0m\033[3m\" message before starting other VMs.\033[0m\n"
if [ "$(virsh domstate $VM_NAME)" = "running" ]
then
    echo "VM $VM_NAME already running"
    exit 0
fi
echo "+cpuset"|tee /sys/fs/cgroup/cgroup.subtree_control
echo "+cpuset"|tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
echo "+cpuset"|tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
echo "+cpuset"|tee /sys/fs/cgroup/init.scope/cgroup.subtree_control
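# Enabling the cpuset controller by itself does not move anything off the VM CPUs.
# With cset gone, one way to actually confine host tasks is systemd's AllowedCPUs=
# slice property (cgroup v2 cpuset, systemd >= 244). Untested sketch, left commented
# out; adjust slice names as needed:
#systemctl set-property --runtime user.slice AllowedCPUs=$CPU_HOST
#systemctl set-property --runtime system.slice AllowedCPUs=$CPU_HOST
#systemctl set-property --runtime init.scope AllowedCPUs=$CPU_HOST
# (and reset each of them to AllowedCPUs=$CPU_ALL after the last VM shuts down)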
# drop caches and compact such that memory is available in contiguous blocks
# ...but only if this is the first VM started
if [ -z "$(pidof $HV)" ]; then
    echo "drop caches and compact memory"
    # allow kernel to write as many dirty pages as possible (sync), then drop caches:
    # 1 - clear Page Cache only, 2 - clear dentries and inodes, 3 - clear all three.
    # Under heavy memory and i/o load do not drop caches.
    # compact_memory is available when CONFIG_COMPACTION is set.
    sync && \
        echo 1 | tee /proc/sys/vm/drop_caches && \
        echo 1 | tee /proc/sys/vm/compact_memory
fi
NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
HUGEPAGES=$((NR_HUGEPAGES + VM_HUGEPAGES))
echo $HUGEPAGES | tee /proc/sys/vm/nr_hugepages
ALLOC_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
if [ "$ALLOC_HUGEPAGES" -ne "$HUGEPAGES" ]
then
    echo "Not able to allocate hugepages"
    echo $NR_HUGEPAGES | tee /proc/sys/vm/nr_hugepages
    exit 1
fi
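# optional sanity check: hugepage accounting as the kernel sees it
#grep -E 'HugePages_(Total|Free)|Hugepagesize' /proc/meminfo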
# ...but only if this is the first VM started
if [ -z "$(pidof $HV)" ]; then
    # the kernel's dirty page writeback mechanism uses kthread workers. They introduce
    # massive arbitrary latencies when doing disk writes on the host and aren't
    # migrated by cset. Restrict the workqueue to use only cpu 0.
    # how to determine?
    echo 1 | tee /sys/devices/virtual/workqueue/cpumask
    echo 0 | tee /sys/bus/workqueue/devices/writeback/numa
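    # (the cpumask file takes a hex bitmask, bit N = cpu N; the value in effect can
    # be read back with: cat /sys/devices/virtual/workqueue/cpumask)
    # a less drastic alternative, assuming CPU_HOST_MASK matches CPU_HOST, would be:
    #echo $CPU_HOST_MASK | tee /sys/devices/virtual/workqueue/cpumask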
    # all in
    echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
    # selectively
    #cpupower -c $CPU_VM frequency-set -g performance
    # THP can allegedly result in jitter. Better keep it off.
    echo never | tee /sys/kernel/mm/transparent_hugepage/enabled
fi
virsh start $VM_NAME
sleep 10
while [ "$(virsh domstate $VM_NAME)" = "running" ]
do
    sleep 1
done
NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
if ((NR_HUGEPAGES >= VM_HUGEPAGES)); then
    echo $((NR_HUGEPAGES-VM_HUGEPAGES)) | tee /proc/sys/vm/nr_hugepages
fi
# reset to values observed before running this script
# ...but only if this was the last VM standing
if [ -z "$(pidof $HV)" ]; then
    echo madvise | tee /sys/kernel/mm/transparent_hugepage/enabled
    echo schedutil | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
    echo $CPU_ALL_MASK | tee /sys/devices/virtual/workqueue/cpumask
    echo 1 | tee /sys/bus/workqueue/devices/writeback/numa
fi
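# The governor and THP values above are hard-coded; to literally restore the values
# observed before running this script, they could be captured up front instead
# (untested sketch):
#   THP_PREV=$(grep -o '\[.*\]' /sys/kernel/mm/transparent_hugepage/enabled | tr -d '[]')
#   GOV_PREV=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
# and echoed back here.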