Proxmox vGPU Hook Script
#!/usr/bin/env python3
import os
import re
import sys


def get_available_gpu(vgpu_type):
    # In /sys/bus/pci/devices/ find the next free NVIDIA vGPU device
    # and return its PCI address together with the matching vGPU type ID
    for device in os.listdir('/sys/bus/pci/devices/'):
        # Skip devices that do not expose an nvidia directory
        if not os.path.isdir(f'/sys/bus/pci/devices/{device}/nvidia'):
            continue
        # Check current_vgpu_type to see whether the device is already in use
        with open(f'/sys/bus/pci/devices/{device}/nvidia/current_vgpu_type') as file:
            current_vgpu_type = file.read()
        # If it is in use, continue to the next device
        if current_vgpu_type != '0\n':
            continue
        with open(f'/sys/bus/pci/devices/{device}/nvidia/creatable_vgpu_types') as file:
            available_vgpu_types = file.read()
        for line in available_vgpu_types.splitlines():
            if vgpu_type in line:
                print(f'Found available: /sys/bus/pci/devices/{device}')
                print(f'NVIDIA ID, type: {line}')
                vgpu_id = line.split(" : ")[0].strip()
                return device, vgpu_id
    print("No available NVIDIA vGPU found, are virtual functions enabled? (systemctl start nvidia-sriov)")
    sys.exit(404)


def parse_vgpu_type_id(config):
    # Match a VM tag of the form nvidia-<vGPU type ID>
    pattern = r'(.*)nvidia-(\d+)'
    # Search for the pattern in the VM's tags (absent tags mean no vGPU)
    match = re.search(pattern, config.get('tags', ''))
    # If a match is found, extract and return the vGPU type ID
    if match:
        return match.group(2)
    return None


def parse_vgpu_bus_id(config) -> list:
    # Match every vfio-pci sysfsdev path in the VM's extra QEMU args
    pattern = r'-device vfio-pci,sysfsdev=(/sys/bus/pci/devices/[0-9a-fA-F:.]+)'
    # findall returns an empty list when no vGPU location is specified
    return re.findall(pattern, config.get('args', ''))


def parse_vm_config(vmid, from_node):
    config_file = f'/etc/pve/qemu-server/{vmid}.conf'
    if from_node:
        config_file = f'/etc/pve/nodes/{from_node}/qemu-server/{vmid}.conf'
    with open(config_file) as file:
        config = file.read()
    # Split each "key: value" line into a dict entry
    config_dict = {}
    for line in config.splitlines():
        # Skip blank lines and section headers such as [snapshot_name]
        if ': ' not in line:
            continue
        key, value = line.split(': ', 1)
        config_dict[key] = value
    return config_dict


def parse_line_config(config_line, item):
    # Parse a comma-separated "key=value" config line and return one item
    line_dict = {}
    for part in config_line.split(','):
        # Split on the first '=' only, since values may themselves contain '='
        key, value = part.split('=', 1)
        line_dict[key] = value
    return line_dict.get(item, None)


def main():
    if len(sys.argv) < 3:
        print("Usage: nvidia_allocator.py <vmid> <phase>")
        print("       nvidia_allocator.py <vmid> get_command <vgpu_name>")
        sys.exit(1)
    vmid = sys.argv[1]
    phase = sys.argv[2]
    if phase == "get_command":
        if len(sys.argv) < 4:
            print("Usage: nvidia_allocator.py <vmid> get_command <vgpu_name>")
            sys.exit(1)
        vgpu_name = sys.argv[3]

    # Read the VM config file (from the source node when the VM was migrated)
    from_node = os.environ.get("PVE_MIGRATED_FROM", None)
    config_dict = parse_vm_config(vmid, from_node)

    if phase == 'get_command':
        available_vgpu, gpu_id = get_available_gpu(vgpu_name)
        uuid = parse_line_config(config_dict['smbios1'], 'uuid')
        print(f"qm set {vmid} --hookscript local:snippets/nvidia_allocator.py")
        print(
            f"qm set {vmid} --args \"-device vfio-pci,sysfsdev=/sys/bus/pci/devices/{available_vgpu} -uuid {uuid}\"")
        tags = set(filter(None, config_dict.get('tags', '').strip().split(';')))
        tags.add(f"nvidia-{gpu_id}")
        print(f"qm set {vmid} --tags \"{';'.join(tags)}\"")
        sys.exit(0)

    # Get the vGPU type we want from the config
    vgpu_type_id = parse_vgpu_type_id(config_dict)
    if not vgpu_type_id:
        # VM doesn't seem to require a vGPU
        sys.exit(0)
    vgpu_paths = parse_vgpu_bus_id(config_dict)
    if not vgpu_paths:
        # No vGPU location specified
        sys.exit(0)

    if phase == 'pre-start':
        # First pass: check that every configured path exists and release any
        # stale allocation (e.g. after a crash or a migration where post-stop
        # never ran on this node)
        for vgpu_path in vgpu_paths:
            if not os.path.exists(vgpu_path):
                print(f"Specified vGPU not found, rerun nvidia_allocator get_command or check the drivers: {vgpu_path}")
                sys.exit(1)
            stop(vgpu_path)
        # Second pass: allocate only after all paths have been validated, so a
        # misconfigured second device cannot leave the first one half-configured
        for vgpu_path in vgpu_paths:
            # Write the vgpu_type_id to current_vgpu_type to claim the vGPU;
            # let Python raise on failure so the hook crashes out with information
            with open(f'{vgpu_path}/nvidia/current_vgpu_type', 'w') as file:
                file.write(vgpu_type_id)

    if phase == 'post-stop':
        for vgpu_path in vgpu_paths:
            stop(vgpu_path)


def stop(vgpu_path):
    # Write 0 to current_vgpu_type to mark the vGPU as no longer in use
    try:
        with open(f'{vgpu_path}/nvidia/current_vgpu_type', 'w') as file:
            file.write('0')
    except (FileNotFoundError, PermissionError):
        # The vGPU path no longer exists or cannot be written; nothing to release
        print("vGPU already de-allocated")


if __name__ == "__main__":
    main()
    # Make sure we exit with a 0 status code
    sys.exit(0)
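
A minimal usage sketch, assuming the script is saved as nvidia_allocator.py on the default 'local' snippets storage that the printed hookscript line references (the VMID 100 and the vGPU profile name GRID-A16-2Q are placeholders):

    cp nvidia_allocator.py /var/lib/vz/snippets/nvidia_allocator.py
    chmod +x /var/lib/vz/snippets/nvidia_allocator.py
    /var/lib/vz/snippets/nvidia_allocator.py 100 get_command GRID-A16-2Q

The last command only prints the qm set commands; they still have to be run by hand to attach the hookscript, args, and tag to the VM.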
Thanks for this! I updated it a bit to incorporate Python 3 features; also, at least on my machine, the extra stop was causing a problem.
I found the same issue on my systems and have updated it since; see the edit. The extra stop is necessary when the machine crashes (not through the Proxmox system) or is migrated, in which case 'post-stop' is never called, so the script tries to clean up the stale allocation before restarting.
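For reference, the release that both the pre-start pass and stop() perform is just a write of 0 to the device's current_vgpu_type file, so a stale allocation can also be cleared by hand (the PCI address is a placeholder):

    echo 0 > /sys/bus/pci/devices/0000:41:00.4/nvidia/current_vgpu_type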
I also made it print the correct sequence of commands; once I verify this always works, I may add an option to execute them directly from the script.
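For example, the get_command phase prints a sequence like the following (the VMID, PCI address, UUID, and vGPU type ID are illustrative placeholders):

    qm set 100 --hookscript local:snippets/nvidia_allocator.py
    qm set 100 --args "-device vfio-pci,sysfsdev=/sys/bus/pci/devices/0000:41:00.4 -uuid 123e4567-e89b-12d3-a456-426614174000"
    qm set 100 --tags "nvidia-256"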