p7cq · January 19, 2024 14:27
diff --git a/vm1-start.sh b/vm1-start.sh
 #!/usr/bin/env bash

 #
 # CPU isolation in QEMU/KVM
 #
 # As the cset scripts no longer work (systemd switched to cgroups v2), this is my
 # attempt at emulating its functionality. It may be incorrect and/or it may break
 # stuff. Blind copy-pasting with some reasoning follows.
 #
 # Host:
 # - CPU: AMD Ryzen 9 3950X
 # - OS: Arch Linux
 # - VM: 5 x (4 CPUs, 16GB RAM)
 #
 # From https://www.reddit.com/r/VFIO/comments/ebe3l5/deprecated_isolcpus_workaround :
 #echo "+cpuset"|tee /sys/fs/cgroup/cgroup.subtree_control
 #echo "+cpuset"|tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
 #echo "+cpuset"|tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
 #echo "+cpuset"|tee /sys/fs/cgroup/init.scope/cgroup.subtree_control
 #
 # https://www.reddit.com/r/VFIO/comments/mihb5j/systemd248_breaks_vm_boot_libvirt/
 # https://www.reddit.com/r/VFIO/comments/j1a5jm/cpu_pinning_explaination/
 # https://www.reddit.com/r/VFIO/comments/ij25rg/splitting_ht_cores_between_host_and_vm/

 # CPU frequency scaling tools (see wiki)
 # cpupower frequency-info
 # turbostat

 # https://documentation.suse.com/sle-rt/15-SP1/pdf/art-slert-virtguide_color_en.pdf
 # Disable interrupt balancing (irqbalance). The irqbalance daemon is enabled by default
 # and it distributes hardware interrupts across CPUs in a multi-core system to increase
 # performance. When irqbalance is disabled, all interrupts will be handled by cpu0, and
 # therefore the guest should NOT run on cpu0. - how to do it?
 #

 # ASCII sketch with I/O, CCD0/CCD1 and CCX0-CCX3 boxes (presumably the layout
 # of the host CPU named in the header - confirm). The guard [[ 1 = 2 ]] is
 # always false, so the echo never runs and nothing is printed.
 [[ 1 = 2 ]] && echo -e "\n
    +---------------------------+
    |              +----------+ |
    |              |   CCD0   | |
    |              | +------+ | |
    |              | | CCX0 | | |
    |  +-------+   | +------+ | |
    |  |       |   | | CCX1 | | |
    |  |       |   | +------+ | |
    |  |  I/O  |   +----------+ |
    |  |       |   +----------+ |
    |  |       |   |   CCD1   | |
    |  +-------+   | +------+ | |
    |              | | CCX2 | | |
    |              | +------+ | |
    |              | | CCX3 | | |
    |              | +------+ | |
    |              +----------+ |
    +---------------------------+
 \n"

 # --- Per-VM settings ---------------------------------------------------------
 # libvirt domain that this script starts and then waits on.
 VM_NAME=vm1
 # Number of hugepages added to vm.nr_hugepages for this VM.
 VM_HUGEPAGES=8192

 # --- CPU partitioning --------------------------------------------------------
 # The *_MASK values are hex CPU bitmasks of the kind written to the workqueue
 # cpumask file: bit n stands for CPU n, with CPU 0 as the lowest bit.
 CPU_ALL="0-31"
 CPU_ALL_MASK=ffffffff
 CPU_HOST="0-7,16-19"
 # CPUs 0-7 -> 0x000000ff, CPUs 16-19 -> 0x000f0000.
 CPU_HOST_MASK=000f00ff
 # NOTE(review): CPU 20 appears in neither CPU_HOST nor CPU_VM - confirm whether
 # that is intentional.
 CPU_VM="8-15,21-31"

 # Process name looked up with pidof to tell whether any VM is running.
 HV="qemu-system-x86_64"

 # Services that must be active before the VM is started. Each one is started
 # if it is not already active; the script aborts with status 1 if a service is
 # still not active afterwards.
 NEED=("haproxy" "zfs-share" "nfs-server" "libvirtd")
 for n in "${NEED[@]}"; do
   # Quoted expansions keep each unit name as a single word (no word splitting
   # or globbing).
   if [[ "$(systemctl is-active "${n}")" != "active" ]]; then
     systemctl start "${n}"
   fi
   if [[ "$(systemctl is-active "${n}")" != "active" ]]; then
     echo "failed"
     exit 1
   fi
   echo "${n} is up"
 done

 # Tell the operator which libvirt message to wait for before launching further
 # VMs (italic/bold via ANSI escapes).
 echo -e "\n\033[3mWait for \"\033[1mDomain $VM_NAME started\033[0m\033[3m\" message before starting other VMs.\033[0m\n"

 # Nothing left to do when the domain is already up.
 case "$(virsh domstate "$VM_NAME")" in
   running)
     echo "VM $VM_NAME already running"
     exit 0
     ;;
 esac

 # Write "+cpuset" to the subtree_control file of the root cgroup and of
 # user.slice, system.slice and init.scope, in that order. tee also echoes each
 # value to stdout.
 for ctl in \
   /sys/fs/cgroup/cgroup.subtree_control \
   /sys/fs/cgroup/user.slice/cgroup.subtree_control \
   /sys/fs/cgroup/system.slice/cgroup.subtree_control \
   /sys/fs/cgroup/init.scope/cgroup.subtree_control
 do
   echo "+cpuset" | tee "$ctl"
 done

 # Drop caches and compact memory so that it is available in contiguous blocks,
 # but only if this is the first VM started (pidof finds no $HV process).
 if [[ -z "$(pidof "$HV")" ]]; then
   echo "drop caches and compact memory"
   # Each step runs only if the previous one succeeded:
   #   sync           - allow the kernel to write as many dirty pages as possible
   #   drop_caches=1  - clear the page cache only (2 = dentries and inodes,
   #                    3 = all three); do not drop caches under heavy memory
   #                    and I/O load
   #   compact_memory - available when CONFIG_COMPACTION is set
   if sync && echo 1 | tee /proc/sys/vm/drop_caches; then
     echo 1 | tee /proc/sys/vm/compact_memory
   fi
 fi

 # Grow the hugepage pool by this VM's share, then read the value back. If the
 # pool did not reach the requested size, restore the previous size and abort.
 NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
 # Shell arithmetic instead of bc: without bc installed HUGEPAGES would be empty
 # and the comparison below would error out instead of aborting the start.
 HUGEPAGES=$((NR_HUGEPAGES + VM_HUGEPAGES))

 echo "$HUGEPAGES" | tee /proc/sys/vm/nr_hugepages
 ALLOC_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)

 if [[ "$ALLOC_HUGEPAGES" -ne "$HUGEPAGES" ]]; then
   echo Not able to allocate hugepages
   echo "$NR_HUGEPAGES" | tee /proc/sys/vm/nr_hugepages
   exit 1
 fi

 # Host-wide tuning, applied only when pidof finds no $HV process, i.e. when
 # this is the first VM being started.
 if [[ -z "$(pidof "$HV")" ]]; then
   # The kernel's dirty page writeback uses kthread workers. They introduce
   # massive arbitrary latencies during host disk writes and are not migrated
   # by cset, so restrict the workqueue to CPU 0 only (mask 1).
   # TODO: how to determine the right mask?
   printf '%s\n' 1 | tee /sys/devices/virtual/workqueue/cpumask
   printf '%s\n' 0 | tee /sys/bus/workqueue/devices/writeback/numa

   # Performance governor on every CPU. The selective alternative would be:
   #cpupower -c $CPU_VM frequency-set -g performance
   printf '%s\n' performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

   # THP can allegedly result in jitter, so keep it off.
   printf '%s\n' never | tee /sys/kernel/mm/transparent_hugepage/enabled
 fi

 # Start the domain, wait ten seconds, then poll once per second until virsh no
 # longer reports it as "running".
 virsh start "$VM_NAME"

 sleep 10
 until [[ "$(virsh domstate "$VM_NAME")" != "running" ]]; do
   sleep 1
 done

 # Shrink the hugepage pool by this VM's share, unless the pool is already
 # smaller than that share.
 NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
 if [[ "$NR_HUGEPAGES" -ge "$VM_HUGEPAGES" ]]; then
   echo "$((NR_HUGEPAGES - VM_HUGEPAGES))" | tee /proc/sys/vm/nr_hugepages
 fi

 # Undo the host-wide tuning once pidof finds no $HV process, i.e. this was the
 # last VM standing. The values written are fixed (madvise, schedutil, the
 # all-CPU mask, numa=1); nothing is saved at startup and read back here.
 if [[ -z "$(pidof "$HV")" ]]; then
   printf '%s\n' madvise | tee /sys/kernel/mm/transparent_hugepage/enabled
   printf '%s\n' schedutil | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
   printf '%s\n' "$CPU_ALL_MASK" | tee /sys/devices/virtual/workqueue/cpumask
   printf '%s\n' 1 | tee /sys/bus/workqueue/devices/writeback/numa
 fi
	#!/usr/bin/env bash

	#
	# CPU isolation in QEMU/KVM
	#
	# As the cset scripts no longer work (systemd switched to cgroups v2), this is my
	# attempt at emulating its functionality. It may be incorrect and/or it may break
	# stuff. Blind copy-pasting with some reasoning follows.
	#
	# Host:
	# - CPU: AMD Ryzen 9 3950X
	# - OS: Arch Linux
	# - VM: 5 x (4 CPUs, 16GB RAM)
	#
	# From https://www.reddit.com/r/VFIO/comments/ebe3l5/deprecated_isolcpus_workaround :
	#echo "+cpuset"|tee /sys/fs/cgroup/cgroup.subtree_control
	#echo "+cpuset"|tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
	#echo "+cpuset"|tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
	#echo "+cpuset"|tee /sys/fs/cgroup/init.scope/cgroup.subtree_control
	#
	# https://www.reddit.com/r/VFIO/comments/mihb5j/systemd248_breaks_vm_boot_libvirt/
	# https://www.reddit.com/r/VFIO/comments/j1a5jm/cpu_pinning_explaination/
	# https://www.reddit.com/r/VFIO/comments/ij25rg/splitting_ht_cores_between_host_and_vm/

	# CPU frequency scaling tools (see wiki)
	# cpupower frequency-info
	# turbostat

	# https://documentation.suse.com/sle-rt/15-SP1/pdf/art-slert-virtguide_color_en.pdf
	# Disable interrupt balancing (irqbalance). The irqbalance daemon is enabled by default
	# and it distributes hardware interrupts across CPUs in a multi-core system to increase
	# performance. When irqbalance is disabled, all interrupts will be handled by cpu0, and
	# therefore the guest should NOT run on cpu0. - how to do it?
	#

	# ASCII sketch with I/O, CCD0/CCD1 and CCX0-CCX3 boxes (presumably the layout
	# of the host CPU named in the header - confirm). The guard [[ 1 = 2 ]] is
	# always false, so the echo never runs and nothing is printed.
	[[ 1 = 2 ]] && echo -e "\n
	   +---------------------------+
	   |              +----------+ |
	   |              |   CCD0   | |
	   |              | +------+ | |
	   |              | | CCX0 | | |
	   |  +-------+   | +------+ | |
	   |  |       |   | | CCX1 | | |
	   |  |       |   | +------+ | |
	   |  |  I/O  |   +----------+ |
	   |  |       |   +----------+ |
	   |  |       |   |   CCD1   | |
	   |  +-------+   | +------+ | |
	   |              | | CCX2 | | |
	   |              | +------+ | |
	   |              | | CCX3 | | |
	   |              | +------+ | |
	   |              +----------+ |
	   +---------------------------+
	\n"

	# --- Per-VM settings --------------------------------------------------------
	# libvirt domain that this script starts and then waits on.
	VM_NAME=vm1
	# Number of hugepages added to vm.nr_hugepages for this VM.
	VM_HUGEPAGES=8192

	# --- CPU partitioning -------------------------------------------------------
	# The *_MASK values are hex CPU bitmasks of the kind written to the workqueue
	# cpumask file: bit n stands for CPU n, with CPU 0 as the lowest bit.
	CPU_ALL="0-31"
	CPU_ALL_MASK=ffffffff
	CPU_HOST="0-7,16-19"
	# CPUs 0-7 -> 0x000000ff, CPUs 16-19 -> 0x000f0000.
	CPU_HOST_MASK=000f00ff
	# NOTE(review): CPU 20 appears in neither CPU_HOST nor CPU_VM - confirm
	# whether that is intentional.
	CPU_VM="8-15,21-31"

	# Process name looked up with pidof to tell whether any VM is running.
	HV="qemu-system-x86_64"

	# Services that must be active before the VM is started. Each one is started
	# if it is not already active; the script aborts with status 1 if a service
	# is still not active afterwards.
	NEED=("haproxy" "zfs-share" "nfs-server" "libvirtd")
	for n in "${NEED[@]}"; do
	  # Quoted expansions keep each unit name as a single word (no word
	  # splitting or globbing).
	  if [[ "$(systemctl is-active "${n}")" != "active" ]]; then
	    systemctl start "${n}"
	  fi
	  if [[ "$(systemctl is-active "${n}")" != "active" ]]; then
	    echo "failed"
	    exit 1
	  fi
	  echo "${n} is up"
	done

	# Tell the operator which libvirt message to wait for before launching
	# further VMs (italic/bold via ANSI escapes).
	echo -e "\n\033[3mWait for \"\033[1mDomain $VM_NAME started\033[0m\033[3m\" message before starting other VMs.\033[0m\n"

	# Nothing left to do when the domain is already up.
	case "$(virsh domstate "$VM_NAME")" in
	  running)
	    echo "VM $VM_NAME already running"
	    exit 0
	    ;;
	esac

	# Write "+cpuset" to the subtree_control file of the root cgroup and of
	# user.slice, system.slice and init.scope. The pipe must be a real pipeline
	# operator so that tee actually writes the file.
	echo "+cpuset" | tee /sys/fs/cgroup/cgroup.subtree_control
	echo "+cpuset" | tee /sys/fs/cgroup/user.slice/cgroup.subtree_control
	echo "+cpuset" | tee /sys/fs/cgroup/system.slice/cgroup.subtree_control
	echo "+cpuset" | tee /sys/fs/cgroup/init.scope/cgroup.subtree_control

	# Drop caches and compact memory so that it is available in contiguous
	# blocks, but only if this is the first VM started (pidof finds no $HV
	# process).
	if [ -z "$(pidof "$HV")" ]; then
	  echo "drop caches and compact memory"
	  # Each step runs only if the previous one succeeded:
	  #   sync           - allow the kernel to write as many dirty pages as
	  #                    possible
	  #   drop_caches=1  - clear the page cache only (2 = dentries and inodes,
	  #                    3 = all three); do not drop caches under heavy memory
	  #                    and I/O load
	  #   compact_memory - available when CONFIG_COMPACTION is set
	  sync &&
	    echo 1 | tee /proc/sys/vm/drop_caches &&
	    echo 1 | tee /proc/sys/vm/compact_memory
	fi

	# Grow the hugepage pool by this VM's share, then read the value back. If the
	# pool did not reach the requested size, restore the previous size and abort.
	NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
	# Shell arithmetic: no external bc needed, and HUGEPAGES is always a number.
	HUGEPAGES=$((NR_HUGEPAGES + VM_HUGEPAGES))

	echo "$HUGEPAGES" | tee /proc/sys/vm/nr_hugepages
	ALLOC_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)

	if [[ "$ALLOC_HUGEPAGES" -ne "$HUGEPAGES" ]]; then
	  echo Not able to allocate hugepages
	  echo "$NR_HUGEPAGES" | tee /proc/sys/vm/nr_hugepages
	  exit 1
	fi

	# Host-wide tuning, applied only when pidof finds no $HV process, i.e. when
	# this is the first VM being started.
	if [ -z "$(pidof "$HV")" ]; then
	  # The kernel's dirty page writeback uses kthread workers. They introduce
	  # massive arbitrary latencies during host disk writes and are not migrated
	  # by cset, so restrict the workqueue to CPU 0 only (mask 1).
	  # TODO: how to determine the right mask?
	  echo 1 | tee /sys/devices/virtual/workqueue/cpumask
	  echo 0 | tee /sys/bus/workqueue/devices/writeback/numa

	  # Performance governor on every CPU. The selective alternative would be:
	  #cpupower -c $CPU_VM frequency-set -g performance
	  echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

	  # THP can allegedly result in jitter, so keep it off.
	  echo never | tee /sys/kernel/mm/transparent_hugepage/enabled
	fi

	# Start the domain, wait ten seconds, then poll once per second until virsh
	# no longer reports it as "running".
	virsh start "$VM_NAME"

	sleep 10
	until [[ "$(virsh domstate "$VM_NAME")" != "running" ]]; do
	  sleep 1
	done

	# Shrink the hugepage pool by this VM's share, unless the pool is already
	# smaller than that share.
	NR_HUGEPAGES=$(cat /proc/sys/vm/nr_hugepages)
	if ((NR_HUGEPAGES >= VM_HUGEPAGES)); then
	  echo "$((NR_HUGEPAGES - VM_HUGEPAGES))" | tee /proc/sys/vm/nr_hugepages
	fi

	# Undo the host-wide tuning once pidof finds no $HV process, i.e. this was
	# the last VM standing. The values written are fixed (madvise, schedutil, the
	# all-CPU mask, numa=1); nothing is saved at startup and read back here.
	if [ -z "$(pidof "$HV")" ]; then
	  echo madvise | tee /sys/kernel/mm/transparent_hugepage/enabled
	  echo schedutil | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
	  echo "$CPU_ALL_MASK" | tee /sys/devices/virtual/workqueue/cpumask
	  echo 1 | tee /sys/bus/workqueue/devices/writeback/numa
	fi