-
-
Save tamimibrahim17/d1b7c3a371c8b43ca6950bb27860ae14 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Abort on the first failing command, on a failure anywhere inside a pipeline
# and on expansion of an unset variable.
set -o errexit -o pipefail -o nounset

###################################
# This script can be used by itself, but it's recommended that you read
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/
###################################

# Do not modify these variables (set by Proxmox when calling the script).
# When the script is started by hand without both arguments, print a usage
# hint instead of a bare "unbound variable" error.
vmId="${1:?usage: $0 <vmId> <runPhase>}"
runPhase="${2:?usage: $0 <vmId> <runPhase>}"

echo "Running $runPhase on VM=$vmId"
# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs.
# On 5900x the core config, as seen in lscpu -e, looks like the following:
#  CCX #0:
#   - NUMA: node 0
#   - CPU: 0-5, 12-17 (SMT threads/host CPU#)
#   - CORE: 0-5
#  CCX #1:
#   - NUMA: node 1
#   - CPU: 6-11, 18-23
#   - CORE: 6-11
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430
#
# The pinning can be semi-automated with scripts taking NUMA etc. into account, but every system
# is different, so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are also some unexplored ideas at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md
#
# Useful commands while debugging this code:
#  List running tasks with their affinity as of now: (the "]" filters out kthreads)
#   ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS
#  Track cgroups resources usage: systemd-cgtop
#  See tree of cgroups: systemd-cgls
# Gets QEMU parent process PID for the current VM
#
# Reads the PID from /run/qemu-server/<vmId>.pid (vmId is the global set at
# the top of the script).
#
# Params: <none>
# Outputs: the PID on stdout; diagnostics go to stderr so that callers using
#          $(getQemuPID) never capture an error message as if it were a PID
# Returns: 0 on success, 1 when the PID file is missing, unreadable or empty
getQemuPID () {
  local pidFile="/run/qemu-server/$vmId.pid"
  local qemuParentPid=""

  if [[ -r "$pidFile" ]]; then
    # The file can vanish between the check and the read; treat that as empty
    qemuParentPid=$(< "$pidFile") || qemuParentPid=""
  fi

  if [[ -z "$qemuParentPid" ]]; then
    echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
    return 1
  fi

  echo "$qemuParentPid"
}
# Gets the last logical CPU (thread) of the system
#
# Params: <none>
# Outputs: the value of "nproc --all" minus one
# Returns: 0 on success, 1 when nproc fails
getLastCpu () {
  local cpuCount=""

  # Declared separately from the assignment so a failing nproc is not masked
  cpuCount=$(nproc --all) || return 1
  echo "$(( cpuCount - 1 ))"
}
# Pin vCPU to a host logic CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# The vCPU thread is found by looking for a task of the QEMU process whose
# comm is exactly "CPU <vCPU#>/KVM".
#
# Params: vCPU# hostThread#orList
# Returns: 0 on success; 1 when the QEMU PID or the vCPU task cannot be found;
#          otherwise the exit status of taskset
pinVCpu () {
  local vCpuNum="$1"
  local hostThreadNum="$2"
  local qemuParentPid=""
  local vCpuTaskPid=""
  local commFile=""
  local taskComm=""

  # Declared separately so that a getQemuPID failure is not masked by "local"
  qemuParentPid=$(getQemuPID) || return 1

  for commFile in "/proc/$qemuParentPid/task/"*/comm; do
    taskComm=""
    # A task can exit between the glob expansion and the read - just skip it
    { IFS= read -r taskComm < "$commFile"; } 2>/dev/null || true
    if [[ "$taskComm" == "CPU $vCpuNum/KVM" ]]; then
      # /proc/<pid>/task/<tid>/comm => <tid>
      vCpuTaskPid="${commFile%/comm}"
      vCpuTaskPid="${vCpuTaskPid##*/}"
      break
    fi
  done

  if [[ -z "$vCpuTaskPid" ]]; then
    echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
    return 1
  fi

  echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
  taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}
# Pins all non-vCPU QEMU threads (io, emulator, rcu) to a host logic CPU(s)
# The thread SHOULD probably be a list unlike pinVCpu
#
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to
# sub-scopes, affinity has to be set per-process with taskset here.
#
# Every task of the QEMU process whose comm does not start with "CPU <digit>"
# is treated as a non-vCPU task.
#
# Params: hostThread#orList
# Returns: 0 on success; 1 when the QEMU PID cannot be found, when no task was
#          pinned at all, or when taskset fails for a task that still exists
pinNonVCpuTasks () {
  local hostThreadNum="$1"
  local qemuParentPid=""
  local commFile=""
  local taskComm=""
  local tpid=""
  local pinnedCount=0

  # Declared separately so that a getQemuPID failure is not masked by "local"
  qemuParentPid=$(getQemuPID) || return 1

  for commFile in "/proc/$qemuParentPid/task/"*/comm; do
    taskComm=""
    # A task can exit between the glob expansion and the read - skip it
    { IFS= read -r taskComm < "$commFile"; } 2>/dev/null || continue

    # vCPU threads are handled by pinVCpu
    if [[ "$taskComm" =~ ^CPU\ [0-9] ]]; then
      continue
    fi

    # /proc/<pid>/task/<tid>/comm => <tid>
    tpid="${commFile%/comm}"
    tpid="${tpid##*/}"

    echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
    if taskset --cpu-list --pid "$hostThreadNum" "$tpid"; then
      pinnedCount=$(( pinnedCount + 1 ))
    elif [[ -d "/proc/$qemuParentPid/task/$tpid" ]]; then
      # The task is still there, so this is a real pinning failure
      return 1
    else
      echo "Task \"$taskComm\" (TPID=$tpid) exited before it could be pinned - skipping" >&2
    fi
  done

  if (( pinnedCount == 0 )); then
    echo "ERROR: no non-vCPU tasks found for VM=$vmId (PPID=$qemuParentPid)" >&2
    return 1
  fi
  return 0
}
# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus
# to control their affinity manual pinning is needed.
# There are hacky ways to identify kthreads like parsing "ps", but the proper way to do
# that is to actually check the thread type. All kernel threads are marked with PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
#
# The flags are read from /proc/<pid>/stat. The second field of that file (comm) is
# wrapped in parentheses and may itself contain spaces, so the line is split around the
# LAST ")" instead of on whitespace; otherwise a process named e.g. "(tmux: server)"
# shifts every following field and another value gets interpreted as the flags.
#
# Params: hostThread#orList
# Returns: 0 - pinning is best-effort, individual failures are silently skipped
pinKthreads () {
  local hostThreadNum="$1"
  local statFile=""
  local statLine=""
  local rest=""
  local pid=""
  local comm=""
  local flags=""
  local -a fields=()

  echo "Attempting to pin all kthreads to $hostThreadNum..."

  for statFile in /proc/[0-9]*/stat; do
    # This CAN sometimes fail due to TOC-TOU
    statLine=""
    { IFS= read -r statLine < "$statFile"; } 2>/dev/null || true
    if [[ "$statLine" != *" ("*") "* ]]; then continue; fi

    pid="${statLine%% *}"
    comm="${statLine#*(}"
    comm="${comm%)*}"

    # Everything after the last ") ": state ppid pgrp session tty_nr tpgid flags ...
    rest="${statLine##*) }"
    fields=()
    read -r -a fields <<< "$rest"
    flags="${fields[6]:-}"
    if [[ ! "$flags" =~ ^[0-9]+$ ]]; then continue; fi

    # Ignore not kthreads (PF_KTHREAD == 0x00200000)
    if (( (flags & 0x00200000) != 0x00200000 )); then continue; fi

    # This CAN fail for some kthreads that are needed on specific CPUs
    if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
      echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
    fi
  done
  return 0
}
# Most IRQs can be moved away from the threads running vCPUs, that can cause jitter
# when these are rescheduled. This function is not perfect as it doesn't set a mask
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't
# be needed as if the VM isn't started on boot most if not all busy IRQs would have
# been triggered by now.
#
# Writes the CPU list to every /proc/irq/<n>/smp_affinity_list. IRQs that refuse
# the new affinity are skipped silently (best-effort).
#
# Params: hostThread#orList
pinIrqs () {
  local hostThreadNum="$1"
  local irqAffLst=""
  local irqNum=""

  echo "Pinning IRQs to host thread(s) $hostThreadNum..."

  for irqAffLst in /proc/irq/*/smp_affinity_list; do
    # /proc/irq/<n>/smp_affinity_list => <n>
    irqNum="${irqAffLst%/smp_affinity_list}"
    irqNum="${irqNum##*/}"

    # Grouped so that both open and write errors are silenced
    if { echo "$hostThreadNum" > "$irqAffLst"; } 2> /dev/null; then
      echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
    fi
  done
}
# Set governor/scaling for a host logic CPU (thread)
#
# Reads and writes /sys/devices/system/cpu/cpu<N>/cpufreq/scaling_governor.
# Nothing is written when the requested governor is already active.
#
# Params: hostThread# desiredGovernor
# Returns: 0 on success, 1 when the current governor cannot be queried,
#          otherwise the status of the write to sysfs
setGovernor () {
  local hostCpu="$1"
  local reqGov="$2"
  local govFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
  local curGov=""

  if [[ -r "$govFile" ]]; then
    curGov=$(< "$govFile") || curGov=""
  fi
  if [[ -z "$curGov" ]]; then
    echo "ERROR: failed to query governor for CPU $hostCpu" >&2
    return 1
  fi

  if [[ "$reqGov" == "$curGov" ]]; then
    echo "CPU $hostCpu: requested governor $reqGov - it is already set"
    return 0
  fi

  echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
  echo "$reqGov" > "$govFile"
}
# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive.
# Calls setGovernor once per CPU in ascending order.
#
# Params: hostThreadFrom# hostThreadTo# desiredGovernor
setGovernorRange () {
  local fromCpu="$1"
  local toCpu="$2"
  local reqGov="$3"
  # Local counter: the original used a global "i" that leaked to the caller
  local cpu

  for (( cpu = fromCpu; cpu <= toCpu; cpu++ )); do
    setGovernor "$cpu" "$reqGov"
  done
}
# Resets governor/scaling to default state
# This is done by restarting the "cpufrequtils" service.
#
# Params: <none>
# Returns: exit status of the "service" command
resetGovernor () {
  echo "Resetting CPU governor to default"
  service cpufrequtils restart
}
# Put host CPU (thread) into offline or online state
#
# Reads and writes /sys/devices/system/cpu/cpu<N>/online. Nothing is written
# when the CPU already is in the requested state.
#
# Params: hostThread# desiredState{0,1}
# Returns: 0 on success, 1 when the state cannot be queried or changed
setCpuState () {
  local hostCpu="$1"
  local reqState="$2"
  local stateFile="/sys/devices/system/cpu/cpu$hostCpu/online"
  local curState=""

  if [[ -r "$stateFile" ]]; then
    curState=$(< "$stateFile") || curState=""
  fi
  if [[ -z "$curState" ]]; then
    echo "ERROR: failed to query online status for CPU $hostCpu" >&2
    return 1
  fi

  if [[ "$reqState" == "$curState" ]]; then
    echo "CPU $hostCpu: requested state $reqState - it is already set"
    return 0
  fi

  echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
  # The write is tested directly: with errexit active a separate "$?" check
  # after a failed write would never be reached
  if echo "$reqState" > "$stateFile"; then
    echo "[OK]"
  else
    echo "[FAILED]"
    return 1
  fi
}
# Put host CPU (thread) range into offline or online state. Range is inclusive.
# Calls setCpuState once per CPU in ascending order.
#
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1}
setCpuStateRange () {
  local fromCpu="$1"
  local toCpu="$2"
  local reqState="$3"
  # Local counter: the original used a global "i" that leaked to the caller
  local cpu

  for (( cpu = fromCpu; cpu <= toCpu; cpu++ )); do
    setCpuState "$cpu" "$reqState"
  done
}
# Flushes dirty pages to disk, drops the kernel caches and triggers memory
# compaction (the caller does this before VM start, for hugepages).
#
# Params: <none>
tidyCaches () {
  echo -n "Tidying caches... "
  sync
  # 3 == free page cache as well as dentries and inodes
  echo 3 > /proc/sys/vm/drop_caches
  echo 1 > /proc/sys/vm/compact_memory
  echo "[OK]"
}
# Sets cgroup slice or scope cpu isolation
# The AllowedCPUs property is applied through systemd with "--runtime".
#
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19)
# Returns: exit status of systemctl
setCgroupAllowedCpus () {
  local entity="$1"
  local allowedCpus="$2"

  echo "Forcing \"$entity\" cgroup to only use CPU(s) $allowedCpus"
  # "--" keeps a unit name starting with "-" from being parsed as an option
  systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus"
}
# Sets logical CPUs (threads) which can be used by processes on the host
# The same CPU list is applied to init.scope, system.slice and user.slice.
#
# Params: hostThreadsList (e.g. 11,12,13-19)
setHostAllowedCpus () {
  local allowedCpus="$1"
  local unit=""

  echo "Setting host userland CPU constrain to $allowedCpus"
  for unit in "init.scope" "system.slice" "user.slice"; do
    setCgroupAllowedCpus "$unit" "$allowedCpus"
  done
}
# Sets logical CPUs (threads) which can be used by QEMU processes
# The CPU list is applied to qemu.slice.
#
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
  local allowedCpus="$1"

  echo "Setting QEMU CPU default constrain to $allowedCpus"
  setCgroupAllowedCpus "qemu.slice" "$allowedCpus"
}
# Makes sure that a decoupled slice for some QEMU VMs exist
# This will only do something the first time a VM start
#
# Params: <none>
# Returns: 0 when the slice already exists or was created; non-zero when the
#          last CPU cannot be determined or creating/configuring the slice fails
ensureQemuDecoupledSlice () {
  local sliceDir="/sys/fs/cgroup/qemu-decoupled.slice"
  local lastCPU=""

  if [[ -d "$sliceDir" ]]; then
    return 0
  fi

  # Resolved before anything is created (and declared separately from the
  # assignment) so that a failure here cannot leave a half-configured slice
  lastCPU=$(getLastCpu) || return 1

  echo "Creating decoupled QEMU cgroup"
  mkdir "$sliceDir"

  # The slice itself MUST be allowed to run on ALL CPUs. The reason
  # for that is we will move vCPUs to an isolated set of cores BUT
  # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
  # doesn't allow a thread/task to be in a different cgroup than the
  # parent these tasks must stay in the qemu-decoupled.slice but with
  # different affinity
  setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
}
# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls
#
# All processes from host run under system.slice and user.slice, while all QEMU machines run
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code:
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
#
# Params: <none>
# Returns: 0 on success; non-zero when the QEMU PID cannot be found or any of the
#          cgroup operations fails
decoupleQemuVm () {
  local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
  local qemuParentPid=""

  # Resolved first (and declared separately from the assignment) so that a
  # missing PID is reported before any cgroup is created
  qemuParentPid=$(getQemuPID) || return 1

  ensureQemuDecoupledSlice

  if [[ ! -d "$vmScope" ]]; then
    echo "Creating cgroups scope for VMID=$vmId at $vmScope"
    mkdir "$vmScope"
  fi

  echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
  echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states
#
# Params: qmAction (e.g. start, shutdown) [idleVmId - optional, defaults to 107]
# Returns: exit status of qm
setIdleVm () {
  local action="$1"
  local idleVmId="${2:-107}"

  echo "Setting idle VM to $action"
  qm "$action" "$idleVmId"
}
# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once.
# When VM is turned off and on it will just black-screen and the VM never boots. This is a
# workaround for that issue.
#
# Every PCI address found on the "hostpciN:" lines of /etc/pve/qemu-server/<vmId>.conf is
# removed through sysfs, then the PCI bus is re-scanned. Addresses may be written with a
# domain (0000:0b:00.0) or without one (0b:00.0); the latter get the default domain "0000"
# prepended because sysfs entries always carry it. An address without a function (0000:0b:00)
# means "all functions of that device".
#
# Params: <none>
# Returns: non-zero when the bus re-scan cannot be triggered; removals are best-effort
resetVmPciDevices () {
  local vmConf="/etc/pve/qemu-server/$vmId.conf"
  local pciAddr=""
  local pciAddrFun=""
  local pciAddrFunRm=""

  echo "Resetting VM PCI devices..."

  if [[ ! -r "$vmConf" ]]; then
    echo "WARNING: cannot read $vmConf - no PCI devices will be removed" >&2
  fi

  while IFS= read -r pciAddr; do
    # No hostpci entries at all => nothing to remove
    if [[ -z "$pciAddr" ]]; then continue; fi

    # bus:dev[.fn] => 0000:bus:dev[.fn]
    if [[ "$pciAddr" != *:*:* ]]; then
      pciAddr="0000:$pciAddr"
    fi

    # Single function (mostly SR-IOV or vGPU) device
    if [[ "$pciAddr" == *.* ]]; then
      echo "Removing PCI device function at $pciAddr"
      echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
      continue
    fi

    # Whole device specified => remove all function
    for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr."*/remove; do
      # Unmatched glob (device absent or already removed) stays literal - skip it
      if [[ ! -e "$pciAddrFunRm" ]]; then continue; fi

      # /sys/bus/pci/devices/<addr>.<fn>/remove => .<fn>
      pciAddrFun="${pciAddrFunRm%/remove}"
      pciAddrFun=".${pciAddrFun##*.}"

      echo "Removing PCI device $pciAddr function $pciAddrFun"
      echo 1 > "$pciAddrFunRm" || true

      # This is absolutely required. Attempting to remove one function CAN
      # remove all of them but it's not instantaneous. However, if you hit
      # such a case and try to manually do /remove on another function while
      # the first is being removed a "general protection fault" will happen
      # in the subsequent "pci_stop_and_remove_bus_device_locked()"
      while [[ -f "$pciAddrFunRm" ]]; do
        sleep 1
        echo "Still waiting for $pciAddrFunRm..."
      done
    done
  done < <(
    # Strip the "hostpciN:" key first so that it can never be mistaken for a
    # part of an address, then pull out every address on the line
    sed -n -E 's/^hostpci[0-9]+:[[:space:]]*//p' "$vmConf" 2> /dev/null \
      | grep -o -E '([0-9a-f]+:)?[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?' || true
  )

  echo "Re-scanning PCI devices..."
  echo 1 > /sys/bus/pci/rescan

  # rescan is asynchronous; if we wanted to be 100% correct here we should wait
  # for /sys entries to appear, but 2 seconds delay is good enough
  sleep 2
}
# Dispatch on the phase passed by Proxmox.
# Designate 2nd CCD (core 6-11, thread 6-11+18-23) to the VM and 1st CCD to host/housekeeping stuff
# All modifications should be done in post-start as doing them in pre-start will execute them even
# if the VM fails to start (and thus post-stop will never be called)
case "$runPhase" in
  pre-start)
    # Stop idle VM, drop caches & compact memory for hugepages
    setIdleVm shutdown
    tidyCaches
    resetVmPciDevices
    ;;

  post-start)
    # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
    # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
    # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
    # I have no idea about any alternatives besides CPU hotplug hack (see below)
    # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
    #          any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
    # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
    setHostAllowedCpus "0-5,12-17"
    setQemuAllowedCpus "0-5,12-17"

    # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily
    echo "Offlining to-be pinned CPUs to move tasks away..."
    setCpuStateRange 6 11 0
    setCpuStateRange 18 23 0

    # Move kernel threads & IRQs away from vCPU threads
    # Doing this when CPUs are offlined makes it easier as
    # nothing is running on these CPUs actively
    pinIrqs "0-5,12-17"
    pinKthreads "0-5,12-17"

    # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constrains from above
    echo "Onlineing to-be pinned CPUs..."
    setCpuStateRange 6 11 1
    setCpuStateRange 18 23 1

    # Set frequency scaling to performance mode
    setGovernorRange 6 11 performance
    setGovernorRange 18 23 performance

    # Stats generation causes jitter in VR
    sysctl vm.stat_interval=120

    # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
    # An alternative hacky way to do that would be to iterate over all currently running VMs and
    # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
    # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
    # requires the VM process to be moved to a non-qemu.slice
    decoupleQemuVm

    # Pin vCPUs to correct threads - this is crucial.
    # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
    # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
    # first core, vCPU1 == 2nd thread of first core, vCPU2 == 1st thread of second core etc.
    # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
    # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
    # mapping.
    pinVCpu 0 6
    pinVCpu 1 18
    pinVCpu 2 7
    pinVCpu 3 19
    pinVCpu 4 8
    pinVCpu 5 20
    pinVCpu 6 9
    pinVCpu 7 21
    pinVCpu 8 10
    pinVCpu 9 22
    pinVCpu 10 11
    pinVCpu 11 23

    # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
    # probably be pinned to a single core, but we're counting on host scheduler being smart.
    # To do static pinning here QMP needs to be used to query types of threads:
    # https://wiki.qemu.org/Documentation/QMP
    pinNonVCpuTasks "0-5,12-17"
    ;;

  pre-stop)
    # Nothing to do in this phase
    ;;

  post-stop)
    lastCpu=$(getLastCpu)

    # Allow kthreads, IRQs, host & QEMU to use all CPUs again
    pinKthreads "0-$lastCpu"
    pinIrqs "0-$lastCpu"
    setHostAllowedCpus "0-$lastCpu"
    setQemuAllowedCpus "0-$lastCpu"

    # Restore default scaling
    resetGovernor

    # Restore default virtual mem stats frequency
    sysctl vm.stat_interval=1

    # Reset the passed-through PCI devices, then start idle VM
    resetVmPciDevices
    setIdleVm start
    ;;

  *)
    # Diagnostics belong on stderr; the script deliberately keeps going
    echo "Unknown run phase \"$runPhase\"!" >&2
    ;;
esac

echo "Finished $runPhase on VM=$vmId"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment