-
-
Save kiler129/215e2c8de853209ca429ad5ed40ce128 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Strict mode: abort on any unhandled command failure (errexit), treat a failure
# in any stage of a pipeline as a failure of the whole pipeline (pipefail), and
# treat expansion of an unset variable as an error (nounset).
set -o errexit -o pipefail -o nounset
################################### | |
# This script can be used by itself, but it's recommended that you read | |
# a tutorial on Proxmox forum first: https://forum.proxmox.com/threads/hey-proxmox-community-lets-talk-about-resources-isolation.124256/ | |
################################### | |
# Do not modify these variables (set by Proxmox when calling the script) | |
# Positional arguments supplied by the caller: the VM ID first, the hook phase
# second. Both are read as globals by the functions below.
vmId="$1"
runPhase="$2"
echo "Running $runPhase on VM=$vmId"
# vCPU pinning should be done 1:1 between guest and host, especially on systems using NUMA and/or CCDs. | |
# On 5900x the core config, as seen in lscpu -e, looks like the following: | |
# CCX #0: | |
# - NUMA: node 0 | |
# - CPU: 0-5, 12-17 (SMT threads/host CPU#) | |
# - CORE: 0-5 | |
# CCX #1: | |
# - NUMA: node 1 | |
# - CPU: 6-11, 18-23 | |
# - CORE: 6-11 | |
# "lstopo" shouldn't be used here, as it has a bug when RAM is not NUMA but L3 is: https://github.com/open-mpi/hwloc/issues/430 | |
# | |
# Pinning can be semi-automated with scripts that take NUMA etc. into account, but every system is different,
# so it's better to consciously tune it. Some scripts are here: https://github.com/64kramsystem/qemu-pinning#one-vcpus-per-corethread-except-one-core
# There are some unexplored ideas also at https://github.com/rokups/rokups.github.io/blob/master/pages/gaming-vm-performance.md | |
# | |
# Useful commands while debugging this code: | |
# List running tasks with their affinity as of now: (the "]" filters out kthreads) | |
# ps -T -e -o psr,pid,ppid,pgid,sid,comm,cmd | grep -P '^\s+(6|7|8|9|10|11|18|19|20|21|22|23)' | grep -v -P '\]$' | sort | cut -c-$COLUMNS | |
# Track cgroups resources usage: systemd-cgtop | |
# See tree of cgroups: systemd-cgls | |
# Gets QEMU parent process PID for the current VM | |
getQemuPID () {
  # Globals:  vmId (read)
  # Outputs:  the content of /run/qemu-server/<vmId>.pid on stdout
  # Returns:  0 on success, 1 when the PID file is missing, unreadable or empty
  local pidFile="/run/qemu-server/$vmId.pid"
  local qemuParentPid=""

  if [[ -r "$pidFile" ]]; then
    # read trims surrounding whitespace; "|| true" tolerates a missing
    # trailing newline (read still assigns the value in that case)
    read -r qemuParentPid < "$pidFile" || true
  fi

  if [[ -z "$qemuParentPid" ]]; then
    # Diagnostics MUST go to stderr: callers capture stdout as the PID
    echo "ERROR: failed to get QEMU parent PID for VM=$vmId" >&2
    return 1
  fi

  printf '%s\n' "$qemuParentPid"
}
# Gets the last logical CPU (thread) of the system | |
getLastCpu () {
  # Outputs:  (nproc --all) - 1 on stdout, i.e. the highest zero-based CPU index
  # Returns:  non-zero if nproc fails (instead of silently printing -1)
  local cpuCount
  cpuCount=$(nproc --all) || return
  echo "$(( cpuCount - 1 ))"
}
# Pin vCPU to a host logical CPU (thread)
# The thread SHOULD be a single one, but it can be any taskset list
# | |
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to | |
# sub-scopes, affinity has to be set per-process with taskset here. | |
# | |
# Params: vCPU# hostThread#orList | |
pinVCpu () {
  # Arguments: $1 - vCPU number (matched against the thread name "CPU <n>/KVM")
  #            $2 - host thread number or any taskset CPU list
  # Globals:   vmId (read, for logging)
  # Returns:   1 when the QEMU PID or the vCPU thread cannot be found,
  #            otherwise the exit status of taskset
  local vCpuNum="$1"
  local hostThreadNum="$2"
  local qemuParentPid=""
  local vCpuTaskPid=""

  # Declaration and assignment are separate so a getQemuPID failure is not
  # masked by the exit status of "local"
  qemuParentPid=$(getQemuPID) || return 1

  # -H forces the "path:" prefix even when only one file matches the glob, so
  # the 5th "/"-separated field is always the task ID. "|| true" because a
  # no-match grep would otherwise trip pipefail/errexit before the check below.
  vCpuTaskPid=$(
    grep -H -x -F -- "CPU $vCpuNum/KVM" "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
      | cut -d '/' -f5
  ) || true

  if [[ -z "$vCpuTaskPid" ]]; then
    echo "ERROR: failed to get Task PID for vCPU $vCpuNum" >&2
    return 1
  fi

  echo "Pinning VM $vmId (PPID=$qemuParentPid) vCPU $vCpuNum (TPID=$vCpuTaskPid) to host thread(s) $hostThreadNum"
  taskset --cpu-list --pid "$hostThreadNum" "$vCpuTaskPid"
}
# Pins all non-vCPU QEMU threads (io, emulator, rcu) to host logical CPU(s)
# The thread argument SHOULD probably be a list, unlike in pinVCpu
# | |
# Since cgroups v2 (used by Proxmox) does NOT allow moving tasks/thread to | |
# sub-scopes, affinity has to be set per-process with taskset here. | |
# | |
# Params: hostThread#orList | |
pinNonVCpuTasks () {
  # Arguments: $1 - host thread number or any taskset CPU list
  # Globals:   vmId (read, for logging)
  # Returns:   1 when the QEMU PID cannot be found, no task is found, or
  #            taskset fails for a task that still exists; 0 otherwise
  local hostThreadNum="$1"
  local qemuParentPid=""
  local tpid=""
  local taskComm=""
  local -a nonVCpuTaskPids=()

  # Declaration and assignment are separate so a getQemuPID failure is not
  # masked by the exit status of "local"
  qemuParentPid=$(getQemuPID) || return 1

  # Every task whose name does NOT start with "CPU <digit>". -H forces the
  # "path:" prefix even when the glob matches a single file, so field 5 is
  # always the task ID.
  mapfile -t nonVCpuTaskPids < <(
    grep -H -v -P '^CPU \d' "/proc/$qemuParentPid/task/"*/comm 2>/dev/null \
      | cut -d '/' -f5
  )

  if (( ${#nonVCpuTaskPids[@]} == 0 )); then
    echo "ERROR: found no non-vCPU tasks for VM=$vmId (PPID=$qemuParentPid)" >&2
    return 1
  fi

  for tpid in "${nonVCpuTaskPids[@]}"; do
    # The task list is a snapshot; a task may be gone by the time we get here
    if ! taskComm=$(cat "/proc/$qemuParentPid/task/$tpid/comm" 2>/dev/null); then
      echo "Skipping TPID=$tpid: task exited before it could be pinned" >&2
      continue
    fi

    echo "Pinning VM $vmId (PPID=$qemuParentPid) non-vCPU task \"$taskComm\" (TPID=$tpid) to host thread(s) $hostThreadNum"
    if ! taskset --cpu-list --pid "$hostThreadNum" "$tpid"; then
      if [[ -e "/proc/$qemuParentPid/task/$tpid" ]]; then
        return 1
      fi
      echo "Skipping TPID=$tpid: task exited before it could be pinned" >&2
    fi
  done
}
# Kernel threads (so-called "kthreads") aren't grouped under any of the cgroups. Thus,
# to control their affinity, manual pinning is needed.
# There are hacky ways to identify kthreads, like parsing "ps", but the proper way to
# do that is to actually check the thread type. All kernel threads are marked with the PF_KTHREAD
# mask (see https://elixir.bootlin.com/linux/v6.3-rc6/source/include/linux/sched.h#L1740)
# | |
# Params: hostThread#orList | |
pinKthreads () {
  # Arguments: $1 - host thread number or any taskset CPU list
  #            $2 - (optional) procfs root to scan; defaults to /proc
  # Outputs:   one line per successfully pinned kthread
  # Returns:   0; individual pinning failures are deliberately ignored
  local hostThreadNum="$1"
  local procRoot="${2:-/proc}"
  local statFile=""
  local statLine=""
  local pid=""
  local comm=""
  local flags=""
  local -a statTail=()
  local -r pfKthread=0x00200000

  echo "Attempting to pin all kthreads to $hostThreadNum..."

  for statFile in "$procRoot"/[0-9]*/stat; do
    # This CAN sometimes fail due to TOC-TOU (process gone since the glob was
    # expanded); the group redirect also silences the failed-open message
    statLine=""
    { IFS= read -r statLine < "$statFile" || true; } 2>/dev/null
    if [[ -z "$statLine" ]]; then continue; fi

    # Layout is "pid (comm) state ppid pgrp session tty_nr tpgid flags ...".
    # comm may itself contain spaces or parentheses, so split around the FIRST
    # "(" and the LAST ")" rather than on whitespace.
    pid="${statLine%% *}"
    comm="${statLine#*'('}"
    comm="${comm%')'*}"
    read -r -a statTail <<< "${statLine##*')'}"
    flags="${statTail[6]:-0}"

    # Ignore malformed lines and anything that is not a kthread
    if [[ ! "$flags" =~ ^[0-9]+$ ]] || (( (flags & pfKthread) == 0 )); then
      continue
    fi

    # This CAN fail for some kthreads that are needed on specific CPUs
    if taskset --cpu-list --pid "$hostThreadNum" "$pid" > /dev/null 2>&1; then
      echo "Pinned kthread \"$comm\" (PID=$pid) to host thread(s) $hostThreadNum"
    fi
  done
}
# Most IRQs can be moved away from the threads running vCPUs, that can cause jitter | |
# when these are rescheduled. This function is not perfect as it doesn't set a mask | |
# for not-yet-triggered IRQs (/proc/irq/default_smp_affinity). However, this shouldn't | |
# be needed as if the VM isn't started on boot most if not all busy IRQs would have | |
# been triggered by now. | |
# | |
# Params: hostThread#orList | |
pinIrqs () {
  # Arguments: $1 - host thread number or CPU list written to each
  #                 smp_affinity_list file
  #            $2 - (optional) IRQ directory root; defaults to /proc/irq
  # Outputs:   one line per IRQ whose affinity list was written successfully
  # Returns:   0; IRQs that refuse the new affinity are silently skipped
  local hostThreadNum="$1"
  local irqRoot="${2:-/proc/irq}"
  local irqAffLst=""
  local irqDir=""
  local irqNum=""

  echo "Pinning IRQs to host thread(s) $hostThreadNum..."

  for irqAffLst in "$irqRoot"/*/smp_affinity_list; do
    # Unmatched glob stays literal; skip it
    [[ -e "$irqAffLst" ]] || continue

    # The IRQ number is the name of the directory holding the file
    irqDir="${irqAffLst%/*}"
    irqNum="${irqDir##*/}"

    # Group redirect hides both failed-open and failed-write messages
    if { echo "$hostThreadNum" > "$irqAffLst"; } 2> /dev/null; then
      echo "Pinned IRQ $irqNum to host thread(s) $hostThreadNum"
    fi
  done
}
# Set governor/scaling for a host logic CPU (thread) | |
# Params: hostThread# desiredGovernor | |
setGovernor () {
  # Arguments: $1 - host CPU (thread) number
  #            $2 - governor name to write to scaling_governor
  # Returns:   1 when the current governor cannot be read; 0 when the
  #            requested governor is already active; otherwise the status of
  #            the write to scaling_governor
  local hostCpu="$1"
  local reqGov="$2"
  local govFile="/sys/devices/system/cpu/cpu$hostCpu/cpufreq/scaling_governor"
  local curGov=""

  if [[ -r "$govFile" ]]; then
    read -r curGov < "$govFile" || true
  fi

  if [[ -z "$curGov" ]]; then
    echo "ERROR: failed to query governor for CPU $hostCpu" >&2
    return 1
  fi

  if [[ "$reqGov" == "$curGov" ]]; then
    echo "CPU $hostCpu: requested governor $reqGov - it is already set"
    return 0
  fi

  echo "CPU $hostCpu: changing governor from $curGov to $reqGov"
  echo "$reqGov" > "$govFile"
}
# Sets governor/scaling on a range of host CPUs (threads). Range is inclusive. | |
# Params: hostThreadFrom# hostThreadTo# desiredGovernor | |
setGovernorRange () {
  # Arguments: $1 - first host CPU (inclusive)
  #            $2 - last host CPU (inclusive)
  #            $3 - governor name passed to setGovernor
  # Returns:   1 on the first setGovernor failure, 0 otherwise
  local fromCpu="$1"
  local toCpu="$2"
  local reqGov="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local hostCpu

  for (( hostCpu = fromCpu; hostCpu <= toCpu; hostCpu++ )); do
    setGovernor "$hostCpu" "$reqGov" || return 1
  done
}
# Resets governor/scaling to default state | |
resetGovernor () {
  # Restarts the "cpufrequtils" service; restoring the default governor is
  # left entirely to that service.
  # Returns: the exit status of "service cpufrequtils restart"
  echo "Resetting CPU governor to default"
  service cpufrequtils restart
}
# Put host CPU (thread) into offline or online state | |
# Params: hostThread# desiredState{0,1} | |
setCpuState () {
  # Arguments: $1 - host CPU (thread) number
  #            $2 - desired state: 0 (offline) or 1 (online)
  # Returns:   1 when the current state cannot be read or the write fails;
  #            0 when the state was changed or was already as requested
  local hostCpu="$1"
  local reqState="$2"
  local stateFile="/sys/devices/system/cpu/cpu$hostCpu/online"
  local curState=""

  curState=$(cat "$stateFile" 2>/dev/null) || true
  if [[ -z "$curState" ]]; then
    echo "ERROR: failed to get online status for CPU $hostCpu" >&2
    return 1
  fi

  if [[ "$reqState" == "$curState" ]]; then
    echo "CPU $hostCpu: requested state $reqState - it is already set"
    return 0
  fi

  echo -n "CPU $hostCpu: changing state from $curState to $reqState... "
  # The write is tested directly in the "if": with errexit active, a separate
  # "$?" check after the write would never be reached on failure
  if echo "$reqState" > "$stateFile"; then
    echo "[OK]"
  else
    echo "[FAILED]"
    return 1
  fi
}
# Put host CPU (thread) range into offline or online state. Range is inclusive. | |
# Params: hostThreadFrom# hostThreadTo# desiredState{0,1} | |
setCpuStateRange () {
  # Arguments: $1 - first host CPU (inclusive)
  #            $2 - last host CPU (inclusive)
  #            $3 - desired state passed to setCpuState (0 or 1)
  # Returns:   1 on the first setCpuState failure, 0 otherwise
  local fromCpu="$1"
  local toCpu="$2"
  local reqState="$3"
  # Loop counter is local so it cannot clobber a caller's variable
  local hostCpu

  for (( hostCpu = fromCpu; hostCpu <= toCpu; hostCpu++ )); do
    setCpuState "$hostCpu" "$reqState" || return 1
  done
}
tidyCaches () {
  # Flushes dirty pages with sync, then writes 3 to /proc/sys/vm/drop_caches
  # and 1 to /proc/sys/vm/compact_memory.
  # Returns: non-zero if either write fails
  echo -n "Tidying caches... "
  sync
  echo 3 > /proc/sys/vm/drop_caches
  echo 1 > /proc/sys/vm/compact_memory
  echo "[OK]"
}
# Sets cgroup slice or scope cpu isolation | |
# Params: sliceOrScopeName hostThreadsList (e.g. 11,12,13-19) | |
setCgroupAllowedCpus () {
  # Arguments: $1 - systemd slice or scope name
  #            $2 - CPU list (e.g. 11,12,13-19) used as the AllowedCPUs value
  # Returns:   the exit status of systemctl
  local entity="$1"
  local allowedCpus="$2"

  echo "Forcing \"$entity\" cgroup to only use CPU(s) $allowedCpus"
  # --runtime is passed so systemd applies the property as a runtime setting
  systemctl set-property --runtime -- "$entity" "AllowedCPUs=$allowedCpus"
}
# Sets logical CPUs (threads) which can be used by processes on the host | |
# Params: hostThreadsList (e.g. 11,12,13-19) | |
setHostAllowedCpus () {
  # Arguments: $1 - CPU list (e.g. 11,12,13-19) applied to init.scope,
  #                 system.slice and user.slice, in that order
  # Returns:   1 on the first setCgroupAllowedCpus failure, 0 otherwise
  local allowedCpus="$1"
  local entity

  echo "Setting host userland CPU constrain to $allowedCpus"
  for entity in "init.scope" "system.slice" "user.slice"; do
    setCgroupAllowedCpus "$entity" "$allowedCpus" || return 1
  done
}
# Sets logical CPUs (threads) which can be used by QEMU processes
# Params: hostThreadsList (e.g. 11,12,13-19)
setQemuAllowedCpus () {
  # Arguments: $1 - CPU list (e.g. 11,12,13-19) applied to qemu.slice
  # Returns:   the exit status of setCgroupAllowedCpus
  local allowedCpus="$1"

  echo "Setting QEMU CPU default constrain to $allowedCpus"
  setCgroupAllowedCpus "qemu.slice" "$allowedCpus"
}
# Makes sure that a decoupled slice for some QEMU VMs exist | |
# This will only do something the first time a VM start | |
# Params: <none> | |
ensureQemuDecoupledSlice () {
  # Creates /sys/fs/cgroup/qemu-decoupled.slice if it does not exist yet and
  # allows it to use CPUs 0..<last CPU>.
  # Returns: 0 when the slice already exists or was set up; non-zero when the
  #          last CPU cannot be determined, mkdir fails, or the AllowedCPUs
  #          update fails
  local sliceDir="/sys/fs/cgroup/qemu-decoupled.slice"
  local lastCPU=""

  if [[ -d "$sliceDir" ]]; then
    return 0
  fi

  # Resolved BEFORE creating the directory: if this fails after mkdir, the
  # early return above would skip the AllowedCPUs setup on every later call
  lastCPU=$(getLastCpu) || return 1

  echo "Creating decoupled QEMU cgroup"
  mkdir "$sliceDir" || return 1

  # The slice itself MUST be allowed to run on ALL CPUs. The reason
  # for that is we will move vCPUs to an isolated set of cores BUT
  # put emulator and iothread(s) on the shared CPUs. Since cgroups v2
  # doesn't allow a thread/task to be in a different cgroup than the
  # parent these tasks must stay in the qemu-decoupled.slice but with
  # different affinity
  setCgroupAllowedCpus "qemu-decoupled.slice" "0-$lastCPU"
}
# Moves the VM to an isolated cgroup, outside of the OS user/system/init groups, as well | |
# as away from the standard qemu.slice used by Proxmox; see systemd-cgls | |
# | |
# All processes from host run under system.slice and user.slice, while all QEMU machines run | |
# under qemu.slice. Proxmox actually hardcodes that slice in their startup code: | |
# https://github.com/proxmox/qemu-server/blob/79f5ca393ab3608ff2e82c929167f079f964a505/PVE/QemuServer.pm#L5892-L5893 | |
# This means that setting "setQemuAllowedCpus" to 1st CCX makes it impossible to pin vCPU
# threads to the 2nd CCX (taskset will fail), as the parent slice where the thread/service is
# running will enforce 1st CCX only AllowedCPUs. The only way around this I found is to migrate
# the VM scope (each one gets a separate one named <VMID>.scope) to a different scope which isn't
# under any of the standard slices. However, this is not supported by systemd, as confirmed by one
# of the systemd authors: https://www.spinics.net/lists/systemd-devel/msg04072.html but cgroups can
# be used directly (albeit without warranties).
# | |
# Params: <none> | |
decoupleQemuVm () {
  # Globals:  vmId (read)
  # Returns:  1 when the QEMU PID cannot be found or a cgroup directory cannot
  #           be prepared; otherwise the status of the write to cgroup.procs
  local vmScope="/sys/fs/cgroup/qemu-decoupled.slice/$vmId.scope"
  local qemuParentPid=""

  # Resolve the PID first, with declaration and assignment separated, so a
  # lookup failure stops here instead of being written into cgroup.procs
  qemuParentPid=$(getQemuPID) || return 1

  ensureQemuDecoupledSlice || return 1

  if [[ ! -d "$vmScope" ]]; then
    echo "Creating cgroups scope for VMID=$vmId at $vmScope"
    mkdir "$vmScope" || return 1
  fi

  echo "Migrating VMID=$vmId PPID=$qemuParentPid to scope $vmScope"
  echo "$qemuParentPid" > "$vmScope/cgroup.procs"
}
# Starts/stops the "idle" windows VM to force very low GPU power states | |
setIdleVm () {
  # Arguments: $1 - qm sub-command to run (the script uses "start" and
  #                 "shutdown")
  #            $2 - (optional) ID of the idle VM; defaults to 107
  # Returns:   the exit status of qm
  local action="$1"
  local idleVmId="${2:-107}"

  echo "Setting idle VM to $action"
  qm "$action" "$idleVmId"
}
# Since updates around 2023/03/20-22 GPUs and some other PCIe devices will only work once. | |
# When VM is turned off and on it will just black-screen and the VM never boots. This is a | |
# workaround for that issue. | |
# | |
# Params: <none> | |
resetVmPciDevices () {
  # Removes every PCI address found on the hostpciN lines of
  # /etc/pve/qemu-server/<vmId>.conf from the PCI bus, then triggers a bus
  # rescan.
  # Globals:  vmId (read)
  # Returns:  0 when there is nothing to reset; otherwise the status of the
  #           final sleep. Individual removals are best-effort.
  echo "Resetting VM PCI devices..."

  local vmConf="/etc/pve/qemu-server/$vmId.conf"
  local pciAddr=""
  local pciAddrFun=""
  local pciAddrFunRm=""
  local waited=0
  local -a vmPciDevices=()
  # Upper bound (seconds) for waiting on a single function to disappear
  local -r removeTimeout=30

  if [[ -r "$vmConf" ]]; then
    mapfile -t vmPciDevices < <(
      grep -E '^hostpci[0-9]+:' "$vmConf" \
        | grep -o -E '[0-9a-f]+:[0-9a-f]+:[0-9a-f]+(\.[0-9]*)?'
    )
  else
    echo "WARNING: cannot read $vmConf" >&2
  fi

  # Nothing to remove => skip the rescan and its delay as well
  if (( ${#vmPciDevices[@]} == 0 )); then
    echo "No passed-through PCI devices configured for VM=$vmId - nothing to reset"
    return 0
  fi

  for pciAddr in "${vmPciDevices[@]}"; do
    # Single function (mostly SR-IOV or vGPU) device
    if [[ "$pciAddr" == *.* ]]; then
      echo "Removing PCI device function at $pciAddr"
      echo 1 > "/sys/bus/pci/devices/$pciAddr/remove" || true
      continue
    fi

    # Whole device specified => remove all functions
    for pciAddrFunRm in "/sys/bus/pci/devices/$pciAddr".*/remove; do
      # Skips an unmatched glob and functions that an earlier removal
      # already took away
      [[ -f "$pciAddrFunRm" ]] || continue

      pciAddrFun="${pciAddrFunRm%/remove}"
      pciAddrFun=".${pciAddrFun##*.}"
      echo "Removing PCI device $pciAddr function $pciAddrFun"
      if ! echo 1 > "$pciAddrFunRm"; then
        # Nothing is being removed, so waiting below would never end
        echo "WARNING: failed to request removal via $pciAddrFunRm" >&2
        continue
      fi

      # This is absolutely required. Attempting to remove one function CAN
      # remove all of them but it's not instantaneous. However, if you hit
      # such a case and try to manually do /remove on another function while
      # the first is being removed a "general protection fault" will happen
      # in the subsequent "pci_stop_and_remove_bus_device_locked()"
      waited=0
      while [[ -f "$pciAddrFunRm" ]]; do
        if (( waited >= removeTimeout )); then
          echo "WARNING: $pciAddrFunRm still present after ${removeTimeout}s - giving up waiting" >&2
          break
        fi
        sleep 1
        waited=$(( waited + 1 ))
        echo "Still waiting for $pciAddrFunRm..."
      done
    done
  done

  echo "Re-scanning PCI devices..."
  echo 1 > /sys/bus/pci/rescan

  # rescan is asynchronous; if we wanted to be 100% correct here we should wait
  # for /sys entries to appear, but 2 seconds delay is good enough
  sleep 2
}
# Designate 2nd CCD (core 6-11, thread 6-11+18-23) to the VM and 1st CCD to host/housekeeping stuff | |
# All modifications should be done in post-start as doing them in pre-start will execute them even | |
# if the VM fails to start (and thus post-stop will never be called) | |
# Dispatch on the hook phase received as the 2nd script argument.
case "$runPhase" in
  pre-start)
    # Stop idle VM, drop caches & compact memory for hugepages, then reset
    # the PCI devices passed to this VM
    setIdleVm shutdown
    tidyCaches
    resetVmPciDevices
    ;;

  # Designate 2nd CCD (core 6-11, thread 6-11+18-23) to the VM and 1st CCD to host/housekeeping stuff
  # All modifications should be done in post-start as doing them in pre-start will execute them even
  # if the VM fails to start (and thus post-stop will never be called)
  post-start)
    # This will inform cgroups via systemd to not use 2nd CCX, effectively constraining host to 1st CCX.
    # This isn't perfect as it will not stop kthreads. "cset" used to mostly work for kthreads (except like docker &
    # ZFS), but it doesn't work with cgroups v2: https://forum.proxmox.com/threads/cset-failing-pve7.95613/
    # I have no idea about any alternatives besides CPU hotplug hack (see below)
    # WARNING: THIS MUST BE DONE BEFORE ANY OTHER PINNING. Manipulating slice/scope CPU lists will reset
    #          any manual pinning due to a systemd bug/design choice: https://github.com/systemd/systemd/issues/23748
    # The "setQemuAllowedCpus" will be overwritten for just this VM by "decoupleQemuVm" later.
    setHostAllowedCpus "0-5,12-17"
    setQemuAllowedCpus "0-5,12-17"

    # Forcefully move all tasks (user space & kthreads) off the 2nd CCX by offlining them temporarily
    echo "Offlining to-be pinned CPUs to move tasks away..."
    setCpuStateRange 6 11 0
    setCpuStateRange 18 23 0

    # Move kernel threads & IRQs away from vCPU threads
    # Doing this when CPUs are offlined makes it easier as
    # nothing is running on these CPUs actively
    pinIrqs "0-5,12-17"
    pinKthreads "0-5,12-17"

    # Bring second CCX online - nothing should be scheduled on it due to host & QEMU constraints from above
    echo "Onlineing to-be pinned CPUs..."
    setCpuStateRange 6 11 1
    setCpuStateRange 18 23 1

    # Set frequency scaling to performance mode
    setGovernorRange 6 11 performance
    setGovernorRange 18 23 performance

    # Stats generation causes jitter in VR
    sysctl vm.stat_interval=120

    # Migrate this VM to a separate isolation group (TLDR: see systemd-cgls)
    # An alternative hacky way to do that would be to iterate over all currently running VMs and
    # taskset their affinity to 1st CCX, but a new VM starting while this one is running will
    # break this. So, it's better to isolate the whole qemu.slice with exception of this VM. That
    # requires the VM process to be moved to a non-qemu.slice
    decoupleQemuVm

    # Pin vCPUs to correct threads - this is crucial.
    # Since SMT/HT is enabled and proper SMT is passed to the guest, the vCPUs need to be pinned
    # to correct host logical CPUs. QEMU assigns vCPUs sequentially; i.e. vCPU0 == 1st thread of
    # first guest core, vCPU1 == 2nd thread of first guest core, vCPU2 == 1st thread of second
    # guest core etc.
    # In Linux (at least this one according to lscpu -e) CPU0 is a 1st thread of first core, with
    # CPU12 being the 2nd/SMT thread of first core. For the 2nd CCX it's a 6+18, 7+19, 8+20, etc
    # mapping.
    pinVCpu 0 6
    pinVCpu 1 18
    pinVCpu 2 7
    pinVCpu 3 19
    pinVCpu 4 8
    pinVCpu 5 20
    pinVCpu 6 9
    pinVCpu 7 21
    pinVCpu 8 10
    pinVCpu 9 22
    pinVCpu 10 11
    pinVCpu 11 23

    # Move all QEMU threads (emulator, iothread) of this VM to 1st CCX. This is pretty dumb. IOThread should
    # probably be pinned to a single core, but we're counting on host scheduler being smart.
    # To do static pinning here QMP needs to be used to query types of threads:
    # https://wiki.qemu.org/Documentation/QMP
    pinNonVCpuTasks "0-5,12-17"
    ;;

  pre-stop)
    # Nothing to do in this phase
    ;;

  post-stop)
    lastCpu=$(getLastCpu)

    # Allow kthreads, IRQs, host & QEMU to use all CPUs again
    pinKthreads "0-$lastCpu"
    pinIrqs "0-$lastCpu"
    setHostAllowedCpus "0-$lastCpu"
    setQemuAllowedCpus "0-$lastCpu"

    # Restore default scaling
    resetGovernor

    # Restore default virtual mem stats frequency
    sysctl vm.stat_interval=1

    # Reset the PCI devices passed to this VM, then start idle VM
    resetVmPciDevices
    setIdleVm start
    ;;

  *)
    # Diagnostic goes to stderr; the script still finishes normally
    echo "Unknown run phase \"$runPhase\"!" >&2
    ;;
esac

echo "Finished $runPhase on VM=$vmId"
after having it work for a few times it stopped working for some reason now giving this error
Migrating VMID=301 PPID=9671 to scope /sys/fs/cgroup/qemu-decoupled.slice/301.scope /var/lib/vz/snippets/proxmox-hook.sh: line 297: echo: write error: Invalid argument
line 297 is echo $qemuParentPid > "$vmScope/cgroup.procs"
after having it work for a few times it stopped working for some reason now giving this error
Migrating VMID=301 PPID=9671 to scope /sys/fs/cgroup/qemu-decoupled.slice/301.scope /var/lib/vz/snippets/proxmox-hook.sh: line 297: echo: write error: Invalid argument
line 297 is
echo $qemuParentPid > "$vmScope/cgroup.procs"
I'm having the same issue. It's something related to cgroup v2's inner workings.
Did a lot of research but couldn't find any workarounds yet.
Hi quite very complete script... not fully sure. But at the section : Resetting VM PCI devices , is it made as i look what is the card being pass and then find it.. and remove-reset/rescran the card ? I do see a 2 liner that you have to define the card. but if your script can detect and do it direct, it's very nice. Thanks for precision.