Last active
January 19, 2024 14:27
-
-
Save p7cq/08f457d8ee071f009d8d8bfe8c98bea0 to your computer and use it in GitHub Desktop.
Dynamic CPU isolation in QEMU/KVM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# CPU isolation in QEMU/KVM
#
# As the cset scripts no longer work (systemd switched to cgroups v2), this is my
# attempt at emulating their functionality. It may be incorrect and/or it may break
# stuff. Blind copy-pasting with some reasoning follows.
#
# Host:
# - CPU: AMD Ryzen 9 3950X
# - OS: Arch Linux
# - VM: 5 x (4 CPUs, 16GB RAM)
#
# From https://www.reddit.com/r/VFIO/comments/ebe3l5/deprecated_isolcpus_workaround :
#echo "+cpuset"|tee /sys/fs/cgroup/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
#echo "+cpuset"|tee /sys/fs/cgroup/init.scope/cgroup.subtree_control
#
# https://www.reddit.com/r/VFIO/comments/mihb5j/systemd248_breaks_vm_boot_libvirt/
# https://www.reddit.com/r/VFIO/comments/j1a5jm/cpu_pinning_explaination/
# https://www.reddit.com/r/VFIO/comments/ij25rg/splitting_ht_cores_between_host_and_vm/
# CPU frequency scaling tools (see wiki):
#   cpupower frequency-info
#   turbostat
# https://documentation.suse.com/sle-rt/15-SP1/pdf/art-slert-virtguide_color_en.pdf
# Disable interrupt balancing (irqbalance). The irqbalance daemon is enabled by default
# and distributes hardware interrupts across CPUs in a multi-core system to increase
# performance. When irqbalance is disabled, all interrupts will be handled by cpu0, and
# therefore the guest should NOT run on cpu0.
# TODO: how to do it?
#
# Chiplet layout diagram (I/O die, CCD0/CCD1, CCX0-CCX3), kept for reference
# only. It is a plain comment, so nothing is ever printed at runtime.
# NOTE(review): column alignment was reconstructed from a whitespace-collapsed
# copy of the drawing; confirm against the original.
#
#   +---------------------------+
#   |           +----------+    |
#   |           |   CCD0   |    |
#   |           | +------+ |    |
#   |           | | CCX0 | |    |
#   | +-------+ | +------+ |    |
#   | |       | | | CCX1 | |    |
#   | |       | | +------+ |    |
#   | |  I/O  | +----------+    |
#   | |       | +----------+    |
#   | |       | |   CCD1   |    |
#   | +-------+ | +------+ |    |
#   |           | | CCX2 | |    |
#   |           | +------+ |    |
#   |           | | CCX3 | |    |
#   |           | +------+ |    |
#   |           +----------+    |
#   +---------------------------+
#
# --- Configuration ----------------------------------------------------------

# Libvirt domain to start. Defaults to vm1; pass another domain name as the
# first argument to reuse this script for a different VM.
VM_NAME="${1:-vm1}"

# Number of hugepages added to vm.nr_hugepages for this VM.
# NOTE(review): presumably 8192 x 2 MiB pages = 16 GiB; confirm against the
# host's hugepage size.
readonly VM_HUGEPAGES=8192

# CPU lists and hex masks. Only CPU_ALL_MASK is used by an active command in
# the visible script (CPU_VM appears in a commented-out cpupower call).
# NOTE(review): CPU_HOST_MASK=ff00f000 selects CPUs 12-15 and 24-31, which
# does not correspond to CPU_HOST="0-7,16-19" (that list would be 000f00ff),
# and CPU_VM skips CPU 20. Values are kept as found; verify before relying
# on them.
readonly CPU_ALL="0-31"
readonly CPU_ALL_MASK="ffffffff"
readonly CPU_HOST="0-7,16-19"
readonly CPU_HOST_MASK="ff00f000"
readonly CPU_VM="8-15,21-31"

# Hypervisor process name; a running instance means another VM is already up.
readonly HV="qemu-system-x86_64"

# systemd units that must be active before the VM is started.
NEED=("haproxy" "zfs-share" "nfs-server" "libvirtd")
# Make sure every unit listed in NEED is active: try to start any unit that
# is not, and abort the script if it still is not active afterwards.
for svc in "${NEED[@]}"; do
  # --quiet suppresses the state text; only the exit status is used.
  if ! systemctl is-active --quiet "${svc}"; then
    systemctl start "${svc}"
  fi
  if ! systemctl is-active --quiet "${svc}"; then
    echo "failed" >&2
    exit 1
  fi
  echo "${svc} is up"
done
# Tell the operator which libvirt message to wait for before launching other
# VMs (the escape sequences switch italic/bold on and off).
echo -e "\n\033[3mWait for \"\033[1mDomain ${VM_NAME} started\033[0m\033[3m\" message before starting other VMs.\033[0m\n"

# Nothing to do when the domain is already up.
if [[ "$(virsh domstate "${VM_NAME}")" == "running" ]]; then
  echo "VM ${VM_NAME} already running"
  exit 0
fi
# Enable the cpuset controller in the root cgroup and in the user.slice,
# system.slice and init.scope cgroups. tee also echoes the written value to
# stdout, as before.
for cgroup_dir in \
  /sys/fs/cgroup \
  /sys/fs/cgroup/user.slice \
  /sys/fs/cgroup/system.slice \
  /sys/fs/cgroup/init.scope; do
  echo "+cpuset" | tee "${cgroup_dir}/cgroup.subtree_control"
done
# Drop caches and compact memory such that memory is available in contiguous
# blocks ...but only if this is the first VM started (no hypervisor process
# is running yet).
if [[ -z "$(pidof "${HV}")" ]]; then
  echo "drop caches and compact memory"
  # The three steps are chained with &&, so each runs only if the previous
  # one succeeded:
  #   sync            - allow the kernel to write as many dirty pages as
  #                     possible
  #   drop_caches     - 1: clear page cache only
  #                     2: clear dentries and inodes
  #                     3: clear all three
  #                     (under heavy memory and I/O load do not drop caches)
  #   compact_memory  - available when CONFIG_COMPACTION is set
  sync \
    && echo 1 | tee /proc/sys/vm/drop_caches \
    && echo 1 | tee /proc/sys/vm/compact_memory
fi
# Grow the hugepage pool by this VM's share, then read the value back to
# verify that the kernel actually granted the full amount.
NR_HUGEPAGES=$(</proc/sys/vm/nr_hugepages)
# Shell arithmetic is sufficient for integer addition; no bc needed.
HUGEPAGES=$((NR_HUGEPAGES + VM_HUGEPAGES))
echo "${HUGEPAGES}" | tee /proc/sys/vm/nr_hugepages
ALLOC_HUGEPAGES=$(</proc/sys/vm/nr_hugepages)
if ((ALLOC_HUGEPAGES != HUGEPAGES)); then
  echo "Not able to allocate hugepages" >&2
  # Roll the pool back to the size it had before this attempt.
  echo "${NR_HUGEPAGES}" | tee /proc/sys/vm/nr_hugepages
  exit 1
fi
# Host-wide tuning ...but only if this is the first VM started (no hypervisor
# process is running yet).
if [[ -z "$(pidof "${HV}")" ]]; then
  # The kernel's dirty page writeback mechanism uses kthread workers. They
  # introduce massive arbitrary latencies when doing disk writes on the host
  # and aren't migrated by cset. Restrict the workqueue to use only cpu 0
  # (cpumask 1).
  # TODO: how to determine the right mask?
  echo 1 | tee /sys/devices/virtual/workqueue/cpumask
  echo 0 | tee /sys/bus/workqueue/devices/writeback/numa

  # All in: performance governor on every CPU matched by the glob.
  echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
  # Selectively:
  #cpupower -c $CPU_VM frequency-set -g performance

  # THP can allegedly result in jitter. Better keep it off.
  echo never | tee /sys/kernel/mm/transparent_hugepage/enabled
fi
# Start the domain and block until it is no longer in the "running" state.
# When the start itself fails there is nothing to wait for, so the grace
# period and the polling loop are skipped and execution continues with the
# statements that follow.
if virsh start "${VM_NAME}"; then
  # Grace period before the first state poll.
  sleep 10
  while [[ "$(virsh domstate "${VM_NAME}")" == "running" ]]; do
    sleep 1
  done
fi
# Give this VM's hugepages back by shrinking the pool by VM_HUGEPAGES.
# Skipped when the pool currently holds fewer pages than that, which avoids
# writing a negative count.
NR_HUGEPAGES=$(</proc/sys/vm/nr_hugepages)
if ((NR_HUGEPAGES >= VM_HUGEPAGES)); then
  echo "$((NR_HUGEPAGES - VM_HUGEPAGES))" | tee /proc/sys/vm/nr_hugepages
fi
# Reset host tunables to the values observed before running this script
# ...but only if this was the last VM standing (no hypervisor process left).
# NOTE(review): madvise / schedutil / numa=1 are hard-coded rather than saved
# at startup; confirm they match this host's defaults.
if [[ -z "$(pidof "${HV}")" ]]; then
  echo madvise | tee /sys/kernel/mm/transparent_hugepage/enabled
  echo schedutil | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
  echo "${CPU_ALL_MASK}" | tee /sys/devices/virtual/workqueue/cpumask
  echo 1 | tee /sys/bus/workqueue/devices/writeback/numa
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment