Skip to content

Instantly share code, notes, and snippets.

@DvdGiessen
Created November 15, 2024 14:00
Show Gist options
  • Save DvdGiessen/9bb229e862fcc428989f576e5db3ee0a to your computer and use it in GitHub Desktop.
Save DvdGiessen/9bb229e862fcc428989f576e5db3ee0a to your computer and use it in GitHub Desktop.
QEMU hook for libvirt
#!/bin/bash
set -euo pipefail

# Settings
# Each setting is later executed as a command (e.g. `if $SETTING ; then`), so
# its value must be exactly `true` or `false`.
# When preparing: remove the device and register its vendor/device ID with
# vfio-pci before a bus rescan, instead of binding vfio-pci directly.
PCI_DEVICES_PREPARE_WITH_REMOVE_RESCAN=false
# When releasing: remove the device and rescan the bus, instead of unbinding
# vfio-pci and rebinding the previously saved driver.
PCI_DEVICES_RELEASE_WITH_REMOVE_RESCAN=true
# Whether IOMMU group siblings that are not passed through themselves get
# bound to vfio-pci (true) or are merely unbound from their driver (false).
PCI_IOMMU_GROUP_SIBLINGS_BIND_TO_VFIO=false

# To do passthru with the GPU already isolated on boot, I used Ctrl-E in ZFSBootmenu
# to edit the kernel cmdline and added the following two parameters:
# vfio-pci.ids=1002:744c,1002:ab30 systemd.unit=multi-user.target
# Those IDs are my GPU and the HDMI audio for it, found with lspci -nn
# And then used SSH to start the VM through `virsh` with "start <vmname>"

# Check permissions
if [[ "$(id -u)" != '0' ]] ; then
  echo >&2 'ERROR: This script must be run as root'
  exit 1
fi

# Check dependencies
if ! xmlstarlet --version >/dev/null 2>&1 ; then
  echo >&2 'ERROR: xmlstarlet is required to parse the guest definition XML'
  exit 1
fi

# Run a known document through the same kind of query used on the guest XML.
# The assignment is deliberately part of the `if` condition: as a standalone
# statement a failing pipeline would abort the script through `set -e`
# (with pipefail) before the error message below could be printed.
if ! XMLSTARLET_SANITY_CHECK_OUTPUT="$(echo "<a><b><c x='1' y='2' z='3'/></b><b><c x='4' y='5' z='6'/></b></a>" | xmlstarlet sel -t -m '/a/b/c' -v '@x' -o : -v '@y' -o : -v '@z' -n)" \
  || [[ "$XMLSTARLET_SANITY_CHECK_OUTPUT" != $'1:2:3\n4:5:6' ]] ; then
  echo >&2 'ERROR: xmlstarlet did not pass sanity check'
  exit 1
fi
# Arguments: the first three positional parameters are stored as the guest
# name, the operation and the stage; anything after them is ignored here.
(( $# >= 3 )) || {
  echo >&2 'ERROR: At least 3 arguments must be given'
  exit 1
}
GUEST_NAME="$1"
OPERATION="$2"
STAGE="$3"

# Per-guest directory used to keep state between hook invocations
STATE_PATH="/tmp/libvirt-hooks-qemu-${GUEST_NAME}"
mkdir -p "$STATE_PATH"

# The guest definition XML has to arrive on stdin; a terminal there means
# nothing was passed in.
[[ ! -t 0 ]] || {
  echo >&2 'ERROR: Guest definition XML must be passed on stdin'
  exit 1
}

# Keep a copy of the XML so the helper functions can query it repeatedly
cp /dev/stdin "$STATE_PATH/guest-definition.xml"
xmlstarlet val -e -q "$STATE_PATH/guest-definition.xml" || {
  echo >&2 'ERROR: Guest definition XML failed to validate'
  exit 1
}
# Helper function for retrieving the existing PCI devices from the XML
#
# Globals:  STATE_PATH (read) - directory holding guest-definition.xml
# Outputs:  stdout - sorted, de-duplicated PCI device names in the form
#                    dddd:bb:ss.f, one per line, for every PCI hostdev source
#                    address that has a directory under /sys/bus/pci/devices
#           stderr - one WARNING line per listed address that has no such
#                    directory
get_pci_devices() {
  # Keep all working variables local so a direct (non-subshell) call cannot
  # overwrite the script-level PCI_DEVICES list.
  local pci_addresses pci_domain pci_bus pci_slot pci_function pci_devicename
  # xmlstarlet sel signals "nothing matched" through a non-zero exit status;
  # treat that like an empty result so the function is also safe to call
  # directly while `set -e` is active.
  pci_addresses="$(xmlstarlet sel -t -m '/domain/devices/hostdev[@type="pci"]/source/address' -v '@domain' -o : -v '@bus' -o : -v '@slot' -o : -v '@function' -n "$STATE_PATH/guest-definition.xml")" || true
  if [[ -z "$pci_addresses" ]] ; then
    return 0
  fi
  echo "$pci_addresses" | while IFS=: read -r pci_domain pci_bus pci_slot pci_function ; do
    # Normalise the numeric fields from the XML to the sysfs naming scheme
    printf -v pci_devicename '%04x:%02x:%02x.%01x' "$pci_domain" "$pci_bus" "$pci_slot" "$pci_function"
    # Check if the PCI device exists
    if [[ ! -d "/sys/bus/pci/devices/$pci_devicename/" ]] ; then
      echo >&2 "WARNING: PCI device $pci_devicename specified in the guest definition does not appear to exist!"
      continue
    fi
    echo "$pci_devicename"
  done | sort -u
}
# Helper function for retrieving non-existent PCI devices from the XML
#
# Globals:  STATE_PATH (read) - directory holding guest-definition.xml
# Outputs:  stdout - sorted, de-duplicated PCI device names in the form
#                    dddd:bb:ss.f, one per line, for every PCI hostdev source
#                    address that has NO directory under /sys/bus/pci/devices
get_nonexistant_pci_devices() {
  # Keep all working variables local so a direct (non-subshell) call cannot
  # overwrite the caller's PCI_DEVICES list.
  local pci_addresses pci_domain pci_bus pci_slot pci_function pci_devicename
  # xmlstarlet sel signals "nothing matched" through a non-zero exit status;
  # treat that like an empty result so the function is also safe to call
  # directly while `set -e` is active.
  pci_addresses="$(xmlstarlet sel -t -m '/domain/devices/hostdev[@type="pci"]/source/address' -v '@domain' -o : -v '@bus' -o : -v '@slot' -o : -v '@function' -n "$STATE_PATH/guest-definition.xml")" || true
  if [[ -z "$pci_addresses" ]] ; then
    return 0
  fi
  echo "$pci_addresses" | while IFS=: read -r pci_domain pci_bus pci_slot pci_function ; do
    # Normalise the numeric fields from the XML to the sysfs naming scheme
    printf -v pci_devicename '%04x:%02x:%02x.%01x' "$pci_domain" "$pci_bus" "$pci_slot" "$pci_function"
    # Only report devices that are missing from sysfs
    if [[ ! -d "/sys/bus/pci/devices/$pci_devicename/" ]] ; then
      echo "$pci_devicename"
    fi
  done | sort -u
}
# Helper function for retrieving PCI devices that share an IOMMU group with any of the given devices
#
# Inputs:   stdin  - PCI device names, one per line
# Outputs:  stdout - sorted, de-duplicated listing of the iommu_group/devices
#                    directory of every given device (which includes the
#                    given devices themselves when they are listed there)
get_pci_iommu_group_siblings() {
  # xargs substitutes each input line for the {} placeholder
  local -r group_members_dir='/sys/bus/pci/devices/{}/iommu_group/devices'
  xargs -I{} ls "$group_members_dir" \
    | sort -u
}
# Helper function to find the active user and its D-Bus session address, if any
#
# Candidate processes are looked up in this order, stopping at the first
# non-empty list: virt-manager processes, then processes of the users listed
# as members of the "libvirt" group in /etc/group, then all processes.
# The first candidate whose environment contains both DISPLAY and XAUTHORITY
# is used.
#
# Outputs: stdout - "<user name> " followed by "<D-Bus address> " and a
#                   newline when DBUS_SESSION_BUS_ADDRESS is present in that
#                   process environment; nothing when no candidate qualifies.
# NOTE: DISPLAY_PIDS is not declared local, so it is a global unless this
# function is run in a subshell (e.g. inside a command/process substitution).
get_active_user_session_dbus() {
# Prefer virt-manager processes; `|| true` keeps an empty result from failing
DISPLAY_PIDS="$(pgrep virt-manager || true)"
if [[ -z "$DISPLAY_PIDS" ]] ; then
# Field 4 of the libvirt line in /etc/group is the member list, handed to
# `ps U`; `k -pid` requests sorting by PID in descending order and `o pid=`
# prints only the PID column without a header (see ps(1)).
DISPLAY_PIDS="$(ps U "$(awk 'BEGIN { FS = ":" } { if ($1 == "libvirt") print $4 }' /etc/group)" k -pid o pid= || true)"
fi
if [[ -z "$DISPLAY_PIDS" ]] ; then
# Last resort: every process, same sorting and output format
DISPLAY_PIDS="$(ps axk -pid o pid= || true)"
fi
echo "$DISPLAY_PIDS" | while read -r DISPLAY_PID ; do
# Gate: the NUL-separated environment must define both DISPLAY and
# XAUTHORITY (awk exits 1 otherwise); the output itself is discarded here.
if awk 'BEGIN { RS = "\0" ; FS = "="; d = ""; x = "" } { $2 = substr($0, index($0, "=") + 1) ; if ($1 == "DISPLAY") { d = $2 } ; if ($1 == "XAUTHORITY") { x = $2 } } END { if (d && x) { print d, x } else { exit 1 } }' "/proc/$DISPLAY_PID/environ" >/dev/null 2>&1 ; then
# First part: resolve the third field of the "Uid:" line of the process
# status (presumably the effective UID, see proc(5) - verify) to a user name
# via getent, append a space and strip newlines so that user and address end
# up on one line. Second part: print the value of DBUS_SESSION_BUS_ADDRESS
# followed by a space.
# NOTE(review): the second awk program has no explicit exit status, so it
# appears to succeed even when the variable is absent; in that case only the
# user name is printed and the loop still stops - confirm this is intended.
if ( awk '{ if ($1 == "Uid:") print $3 }' "/proc/$DISPLAY_PID/status" | xargs getent passwd | cut -d: -f1 ; echo ' ' ) | tr -d $'\n' && awk 'BEGIN { RS = "\0" ; FS = "=" } { if ($1 == "DBUS_SESSION_BUS_ADDRESS") print substr($0, index($0, "=") + 1) " " }' "/proc/$DISPLAY_PID/environ" 2>/dev/null ; then
break
fi
fi
done
}
# Helper function to find the X display of the active user
#
# Candidate processes are looked up in this order, stopping at the first
# non-empty list: virt-manager processes, then processes of the users listed
# as members of the "libvirt" group in /etc/group, then all processes.
#
# Outputs: stdout - "<DISPLAY> <XAUTHORITY>" taken from the environment of
#                   the first candidate process that defines both; nothing
#                   when no candidate qualifies.
get_active_display() {
  # Local so that calling this function never leaks or overwrites globals
  local display_pids display_pid
  display_pids="$(pgrep virt-manager || true)"
  if [[ -z "$display_pids" ]] ; then
    # Field 4 of the libvirt line in /etc/group is the member list
    display_pids="$(ps U "$(awk 'BEGIN { FS = ":" } { if ($1 == "libvirt") print $4 }' /etc/group)" k -pid o pid= || true)"
  fi
  if [[ -z "$display_pids" ]] ; then
    display_pids="$(ps axk -pid o pid= || true)"
  fi
  while read -r display_pid ; do
    # The environment is a list of NUL-separated NAME=value records. The value
    # is everything after the FIRST "=", so values that contain "=" themselves
    # (possible for an XAUTHORITY path) are not truncated; this matches how
    # get_active_user_session_dbus extracts values.
    if awk 'BEGIN { RS = "\0" ; FS = "=" ; d = "" ; x = "" } { v = substr($0, index($0, "=") + 1) ; if ($1 == "DISPLAY") { d = v } ; if ($1 == "XAUTHORITY") { x = v } } END { if (d && x) { print d, x } else { exit 1 } }' "/proc/$display_pid/environ" 2>/dev/null ; then
      break
    fi
  done <<< "$display_pids"
}
# Helper function for showing the confirmation dialog
#
# Globals:  GUEST_NAME (read)
# Inputs:   stdin  - PCI device names that will be passed through, one per line
# Outputs:  stderr - a NOTICE when the dialog cannot be shown
# Returns:  0 when the confirmation is skipped (no usable display or no
#           supported dialog program); otherwise the exit status of
#           zenity/kdialog.
show_pci_confirmation_dialog() {
  # Everything is local so that a direct (non-pipeline) call cannot overwrite
  # the script-level PCI_DEVICES list or leak dialog state.
  local pci_devices dialog_display dialog_xauthority
  local pci_nonexistant_devices pci_iommu_group_siblings
  local device_count plural_suffix dialog_title dialog_text

  pci_devices="$(< /dev/stdin)"

  # Find a display to show the dialog on; without one the confirmation is skipped
  if ! read -r dialog_display dialog_xauthority < <(get_active_display || true) || [[ -z "$dialog_display" || -z "$dialog_xauthority" ]] ; then
    echo >&2 "NOTICE: Could not determine DISPLAY or XAUTHORITY for dialog, skipping confirmation."
    return 0
  fi
  # When xdpyinfo is available, use it to verify that the display is reachable
  if [[ -x "$(command -v xdpyinfo)" ]] && ! DISPLAY="$dialog_display" XAUTHORITY="$dialog_xauthority" xdpyinfo >/dev/null 2>&1 ; then
    echo >&2 "NOTICE: Autodetected DISPLAY and XAUTHORITY for dialog do not appear to work, skipping confirmation."
    return 0
  fi

  pci_nonexistant_devices="$(get_nonexistant_pci_devices)"
  # Siblings are the IOMMU group members that are not in the given list
  # (comm -13 keeps lines unique to its second input)
  pci_iommu_group_siblings="$(comm -13 <(echo "$pci_devices") <(echo "$pci_devices" | get_pci_iommu_group_siblings))"

  # Build the dialog text; "\n" is kept literal here and left for the dialog
  # program to interpret.
  device_count="$(echo "$pci_devices" | wc -l)"
  plural_suffix="$(echo "$pci_devices" | awk 'END { if (NR != 1) print "s" }')"
  dialog_title='Unbind devices from host?'
  dialog_text="Powering on QEMU guest '$GUEST_NAME' will attempt to move ${device_count} PCI device${plural_suffix} from your host system to the guest:\n"
  dialog_text+="\n$(echo "$pci_devices" | xargs -n1 lspci -Dnnks)\n\n"
  if [[ -n "$pci_iommu_group_siblings" ]] ; then
    dialog_text+="Additionally, the following PCI devices share a IOMMU group with the above and will be detached from the host system:\n$(echo "$pci_iommu_group_siblings" | xargs -n1 lspci -Dnnks)\n\n"
  fi
  if [[ -n "$pci_nonexistant_devices" ]] ; then
    dialog_text+="The following PCI devices were specified in the configuration but do not exist and will be skipped:\n$pci_nonexistant_devices\n\n"
  fi
  dialog_text+="This may make your system unresponsive."

  # Show the dialog with whichever supported program is installed; its exit
  # status becomes the return value of this function.
  if [[ -x "$(command -v zenity)" ]] ; then
    DISPLAY="$dialog_display" XAUTHORITY="$dialog_xauthority" zenity --question --title="$dialog_title" --text="$dialog_text" --ok-label='Continue' --cancel-label='Cancel'
  elif [[ -x "$(command -v kdialog)" ]] ; then
    DISPLAY="$dialog_display" XAUTHORITY="$dialog_xauthority" kdialog --title "$dialog_title" --warningcontinuecancel "$dialog_text"
  else
    echo >&2 "NOTICE: No supported GUI dialog program found, skipping confirmation."
  fi
}
if [[ "$OPERATION" == 'prepare' && "$STAGE" == 'begin' ]] ; then
# Body of the 'prepare'/'begin' branch: get the host ready to hand the guest's
# PCI devices over. Every command runs under `set -e`, so a failing step that
# is not part of a condition aborts the hook.
# Ensure bridging rules are applied
if [[ -x "/usr/lib/systemd/systemd-sysctl" ]] ; then
/usr/lib/systemd/systemd-sysctl --prefix=net.bridge
fi
# Parse the XML to obtain a list of passed through PCI devices
PCI_DEVICES="$(get_pci_devices)"
if [[ -n "$PCI_DEVICES" ]] ; then
# Ask the user to confirm
if ! echo "$PCI_DEVICES" | show_pci_confirmation_dialog ; then
echo >&2 "ERROR: User aborted starting VM with PCI passthrough."
exit 1
fi
# These steps *should* not be needed, but it turns out Wayland/KDE does not like losing its GPU and the desktop misbehaves a lot once we return
# Force logout the user
# (only attempted when both a user name and a D-Bus session address were found)
if read -r ACTIVE_USER ACTIVE_USER_DBUS_ADDRESS < <(get_active_user_session_dbus || true) && [[ -n "$ACTIVE_USER" && -n "$ACTIVE_USER_DBUS_ADDRESS" ]] ; then
if sudo -u "$ACTIVE_USER" qdbus --bus "$ACTIVE_USER_DBUS_ADDRESS" org.kde.Shutdown /Shutdown org.kde.Shutdown.logout ; then
sleep 5
fi
fi
# Stop display manager
if systemctl isolate multi-user.target ; then
sleep 1
fi
# Unbind graphics drivers from the virtual consoles
# Only consoles whose name starts with "(M) " are touched; presumably these
# are the module-backed consoles - TODO confirm against the kernel
# documentation on console binding.
for VIRTUAL_CONSOLE in "/sys/class/vtconsole/"* ; do
if grep -qE '^\(M\) ' "$VIRTUAL_CONSOLE/name" ; then
echo 0 > "$VIRTUAL_CONSOLE/bind"
fi
done
# Unbind EFI framebuffer if not already done by kernel cmdline
# NOTE(review): under `set -e` this write aborts the hook when the sysfs
# unbind file cannot be written (e.g. the driver is not present) - confirm
# that this is the desired behaviour.
if ! grep -qE '(^| )video=efifb:off( |$)' /proc/cmdline ; then
echo efi-framebuffer.0 > /sys/bus/platform/drivers/efi-framebuffer/unbind
fi
# Make sure the required VFIO kernel modules are loaded
modprobe vfio
modprobe vfio_iommu_type1
modprobe vfio_pci
# Create directory for saving the PCI driver state
mkdir -p "$STATE_PATH/pci-ids" "$STATE_PATH/pci-drivers"
# Loop over PCI host devices defined for this guest as well as its IOMMU group siblings
# (the loop is the last stage of a pipeline, so its variables live in a subshell)
echo "$PCI_DEVICES" | get_pci_iommu_group_siblings | while read -r PCI_DEVICENAME ; do
# Check if this device itself is passed through or whether it is just a sibling
PCI_DEVICE_IS_IOMMU_GROUP_SIBLING=true
grep -qx "$PCI_DEVICENAME" <(echo "$PCI_DEVICES") && PCI_DEVICE_IS_IOMMU_GROUP_SIBLING=false
# Check if the PCI device exists
if [[ ! -d "/sys/bus/pci/devices/$PCI_DEVICENAME/" ]] ; then
if $PCI_DEVICE_IS_IOMMU_GROUP_SIBLING ; then
echo >&2 "WARNING: PCI IOMMU group sibling $PCI_DEVICENAME does not appear to exist!"
else
echo >&2 "WARNING: PCI device $PCI_DEVICENAME specified in the guest definition does not appear to exist!"
fi
continue
fi
# Determine the currently bound driver
# (empty when the device has no driver symlink)
PCI_DRIVER_CURRENT=""
if [[ -d "/sys/bus/pci/devices/$PCI_DEVICENAME/driver" ]] ; then
PCI_DRIVER_CURRENT="$(basename "$(readlink "/sys/bus/pci/devices/$PCI_DEVICENAME/driver")")"
fi
# Never modify PCI-e (root) ports
if [[ "$PCI_DRIVER_CURRENT" == 'pcieport' ]] ; then
continue
fi
# Write the current bound driver name to the state directory so we can try to restore it later
echo "$PCI_DRIVER_CURRENT" > "$STATE_PATH/pci-drivers/$PCI_DEVICENAME"
# Determine the vendor and device IDs
# (stored as "vvvv dddd", four hex digits each, separated by a space)
PCI_DEVICE_ID=""
if [[ -r "/sys/bus/pci/devices/$PCI_DEVICENAME/vendor" && -r "/sys/bus/pci/devices/$PCI_DEVICENAME/device" ]] ; then
printf -v PCI_DEVICE_ID '%04x %04x' "$(< "/sys/bus/pci/devices/$PCI_DEVICENAME/vendor")" "$(< "/sys/bus/pci/devices/$PCI_DEVICENAME/device")"
fi
# Write the vendor and device IDs to the state directory so we can retrieve these later
echo "$PCI_DEVICE_ID" > "$STATE_PATH/pci-ids/$PCI_DEVICENAME"
# Forcing binding to VFIO is optional for sibling devices
if ! $PCI_DEVICE_IS_IOMMU_GROUP_SIBLING || $PCI_IOMMU_GROUP_SIBLINGS_BIND_TO_VFIO ; then
# If the device is already bound to VFIO, skip it
if [[ "$PCI_DRIVER_CURRENT" == 'vfio-pci' ]] ; then
if $PCI_DEVICE_IS_IOMMU_GROUP_SIBLING ; then
echo >&2 "NOTICE: PCI IOMMU group sibling $PCI_DEVICENAME is already bound to the $PCI_DRIVER_CURRENT driver, skipping."
else
echo >&2 "NOTICE: PCI device $PCI_DEVICENAME is already bound to the $PCI_DRIVER_CURRENT driver, skipping."
fi
continue
fi
# Prevent other drivers from binding to the device
echo vfio-pci > "/sys/bus/pci/devices/$PCI_DEVICENAME/driver_override"
else
# If the device is already unbound, skip it
if [[ "$PCI_DRIVER_CURRENT" == '' ]] ; then
echo >&2 "NOTICE: PCI IOMMU group sibling $PCI_DEVICENAME is already unbound, skipping."
continue
fi
# Prevent any driver from binding to the device
echo none > "/sys/bus/pci/devices/$PCI_DEVICENAME/driver_override"
fi
# Unbind the current driver
if [[ -n "$PCI_DRIVER_CURRENT" ]] ; then
echo "$PCI_DEVICENAME" > "/sys/bus/pci/devices/$PCI_DEVICENAME/driver/unbind"
sleep 1
fi
# Binding to VFIO is optional for sibling devices
if ! $PCI_DEVICE_IS_IOMMU_GROUP_SIBLING || $PCI_IOMMU_GROUP_SIBLINGS_BIND_TO_VFIO ; then
# Should we remove and rescan, or rebind it immediately?
if $PCI_DEVICES_PREPARE_WITH_REMOVE_RESCAN && [[ -n "$PCI_DEVICE_ID" ]] ; then
# Instead of binding the driver directly, remove the device and let the rescan below rediscover it
echo 1 > "/sys/bus/pci/devices/$PCI_DEVICENAME/remove"
sleep 1
# Add the device vendor and product IDs to the list of IDs for the VFIO driver
echo "$PCI_DEVICE_ID" > /sys/bus/pci/drivers/vfio-pci/new_id
else
# Bind the VFIO driver
echo "$PCI_DEVICENAME" > /sys/bus/pci/drivers/vfio-pci/bind
fi
fi
done
# Rescan all PCI devices to rediscover them
if $PCI_DEVICES_PREPARE_WITH_REMOVE_RESCAN ; then
echo 1 > "/sys/bus/pci/rescan"
sleep 1
fi
fi
# Prevent sleep while this guest is running
# Only done when systemctl is available and the libvirt-nosleep@ template
# unit exists; `systemctl cat` is used purely as an existence check. The unit
# name is quoted so it always reaches systemctl as a single literal argument.
if [[ -x "$(command -v systemctl)" ]] && systemctl cat 'libvirt-nosleep@.service' >/dev/null 2>&1 ; then
  systemctl start "libvirt-nosleep@$GUEST_NAME"
fi
elif [[ "$OPERATION" == 'release' && "$STAGE" == 'end' ]] ; then
# Allow sleeping again now this guest is stopped
# Only done when systemctl is available and the libvirt-nosleep@ template
# unit exists; `systemctl cat` is used purely as an existence check. The unit
# name is quoted so it always reaches systemctl as a single literal argument.
if [[ -x "$(command -v systemctl)" ]] && systemctl cat 'libvirt-nosleep@.service' >/dev/null 2>&1 ; then
  systemctl stop "libvirt-nosleep@$GUEST_NAME"
fi
# Give the guest's PCI devices (and their IOMMU group siblings) back to the
# host, then restore the consoles, EFI framebuffer and graphical target.
# Every command runs under `set -e`, so a failing step that is not part of a
# condition aborts the hook.
# Loop over PCI host devices defined for this guest
PCI_DEVICES="$(get_pci_devices)"
if [[ -n "$PCI_DEVICES" ]] ; then
# (the loop is the last stage of a pipeline, so its variables live in a subshell)
echo "$PCI_DEVICES" | get_pci_iommu_group_siblings | while read -r PCI_DEVICENAME ; do
# Check if this device itself is passed through or whether it is just a sibling
PCI_DEVICE_IS_IOMMU_GROUP_SIBLING=true
grep -qx "$PCI_DEVICENAME" <(echo "$PCI_DEVICES") && PCI_DEVICE_IS_IOMMU_GROUP_SIBLING=false
# Check if the PCI device exists
if [[ ! -d "/sys/bus/pci/devices/$PCI_DEVICENAME/" ]] ; then
if $PCI_DEVICE_IS_IOMMU_GROUP_SIBLING ; then
echo >&2 "WARNING: PCI IOMMU group sibling $PCI_DEVICENAME does not appear to exist!"
else
echo >&2 "WARNING: PCI device $PCI_DEVICENAME specified in the guest definition does not appear to exist!"
fi
continue
fi
# Determine the currently bound driver
# (empty when the device has no driver symlink)
PCI_DRIVER_CURRENT=""
if [[ -d "/sys/bus/pci/devices/$PCI_DEVICENAME/driver" ]] ; then
PCI_DRIVER_CURRENT="$(basename "$(readlink "/sys/bus/pci/devices/$PCI_DEVICENAME/driver")")"
fi
# Never modify PCI-e (root) ports
if [[ "$PCI_DRIVER_CURRENT" == 'pcieport' ]] ; then
continue
fi
# Retrieve the PCI device vendor and device IDs from the state directory and then remove the state file
# (note that the state files are removed before the skip checks further down)
PCI_DEVICE_ID=""
if [[ -r "$STATE_PATH/pci-ids/$PCI_DEVICENAME" ]] ; then
PCI_DEVICE_ID="$(< "$STATE_PATH/pci-ids/$PCI_DEVICENAME")"
fi
rm -f "$STATE_PATH/pci-ids/$PCI_DEVICENAME"
# Retrieve the previously bound driver name from the state directory and then remove the state file
# PCI_DRIVER_PREVIOUS_KNOWN distinguishes "no state file" from a saved empty driver name
PCI_DRIVER_PREVIOUS_KNOWN=false
PCI_DRIVER_PREVIOUS=""
if [[ -r "$STATE_PATH/pci-drivers/$PCI_DEVICENAME" ]] ; then
PCI_DRIVER_PREVIOUS_KNOWN=true
PCI_DRIVER_PREVIOUS="$(< "$STATE_PATH/pci-drivers/$PCI_DEVICENAME")"
fi
rm -f "$STATE_PATH/pci-drivers/$PCI_DEVICENAME"
# If the device is already using another driver
if [[ -n "$PCI_DRIVER_CURRENT" && "$PCI_DRIVER_CURRENT" != 'vfio-pci' ]] ; then
echo >&2 "NOTICE: PCI device $PCI_DEVICENAME is already bound to the $PCI_DRIVER_CURRENT driver, skipping."
continue
fi
# Check if we should be unbinding the VFIO driver
# (a device that was already on vfio-pci before the guest started is left alone)
if [[ "$PCI_DRIVER_PREVIOUS" == 'vfio-pci' ]] ; then
echo >&2 "NOTICE: PCI device $PCI_DEVICENAME was previously bound to the $PCI_DRIVER_PREVIOUS driver, skipping."
continue
fi
# Allow all drivers to bind to the device
# (writes a single newline to clear the override set during prepare)
echo > "/sys/bus/pci/devices/$PCI_DEVICENAME/driver_override"
# Should we remove and rescan, or rebind it immediately?
if $PCI_DEVICES_RELEASE_WITH_REMOVE_RESCAN ; then
# Remove the device vendor and product IDs from the list of IDs for the VFIO driver
# (only needed when prepare registered them through new_id)
if $PCI_DEVICES_PREPARE_WITH_REMOVE_RESCAN && [[ -n "$PCI_DEVICE_ID" ]] ; then
echo "$PCI_DEVICE_ID" > /sys/bus/pci/drivers/vfio-pci/remove_id
fi
# Instead of unbinding the driver, let's just remove the device and rediscover it by rescanning
echo 1 > "/sys/bus/pci/devices/$PCI_DEVICENAME/remove"
sleep 1
else
# Unbind the VFIO driver
if [[ -n "$PCI_DRIVER_CURRENT" ]] ; then
echo "$PCI_DEVICENAME" > "/sys/bus/pci/devices/$PCI_DEVICENAME/driver/unbind"
fi
# Check if the previously bound driver is a valid one
if ! "$PCI_DRIVER_PREVIOUS_KNOWN" ; then
echo >&2 "WARNING: Previous bound driver for PCI device $PCI_DEVICENAME is not known, skipping."
continue
fi
if [[ -z "$PCI_DRIVER_PREVIOUS" ]] ; then
echo >&2 "NOTICE: PCI device $PCI_DEVICENAME did not previously have a driver bound, skipping."
continue
fi
# Make sure the driver we are about to bind is loaded
if ! modprobe "$PCI_DRIVER_PREVIOUS" || [[ ! -d "/sys/bus/pci/drivers/$PCI_DRIVER_PREVIOUS" ]] ; then
echo >&2 "ERROR: The driver that PCI device $PCI_DEVICENAME was previously bound to ($PCI_DRIVER_PREVIOUS) does not exist, skipping."
continue
fi
# Bind the previously bound driver
echo "$PCI_DEVICENAME" > "/sys/bus/pci/drivers/$PCI_DRIVER_PREVIOUS/bind"
fi
done
# Rescan all PCI devices to rediscover them
if $PCI_DEVICES_RELEASE_WITH_REMOVE_RESCAN ; then
echo 1 > "/sys/bus/pci/rescan"
sleep 1
fi
# Revert steps that should not have been needed
sleep 1
# Rebind graphics drivers to the virtual consoles
# (same "(M) " name filter as used when unbinding during prepare)
for VIRTUAL_CONSOLE in "/sys/class/vtconsole/"* ; do
if grep -qE '^\(M\) ' "$VIRTUAL_CONSOLE/name" ; then
echo 1 > "$VIRTUAL_CONSOLE/bind"
fi
done
# Rebind EFI framebuffer
# NOTE(review): under `set -e` a failing write here aborts the hook before the
# display manager is restarted below - confirm this is acceptable.
if ! grep -qE '(^| )video=efifb:off( |$)' /proc/cmdline ; then
echo "efi-framebuffer.0" > /sys/bus/platform/drivers/efi-framebuffer/bind
fi
# Restart Display Manager
systemctl isolate graphical.target
fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment