Skip to content

Instantly share code, notes, and snippets.

@schmmd
Created November 20, 2019 19:08
Show Gist options
  • Save schmmd/1aa445be858ce560d48e13ef2041fea1 to your computer and use it in GitHub Desktop.
Save schmmd/1aa445be858ce560d48e13ef2041fea1 to your computer and use it in GitHub Desktop.
Slurm GPU Stats
#!/usr/bin/env python
import os
import re
import subprocess
import socket
import sys
def pids_of_jid(jid):
    """Return the process ids that ``sstat`` reports for a Slurm job.

    Runs ``sstat -p --format=PID -j <jid> --noheader`` and collects every
    PID found in its standard output.

    Args:
        jid: Slurm job id, passed verbatim to ``sstat -j``.

    Returns:
        A list of PID strings in the order they appear in the output. The
        list is empty when ``sstat`` writes nothing to stdout.
    """
    result = subprocess.run(
        ["sstat", "-p", "--format=PID", "-j", jid, "--noheader"],
        stdout=subprocess.PIPE,
    )
    output = result.stdout.decode("utf-8")
    # The PIDs arrive separated by "," with "|" field terminators.  Pulling
    # out the digit runs (instead of trimming and splitting) means empty
    # output gives [] rather than [""], and PIDs on separate output lines
    # are not fused into one bogus entry.
    return re.findall(r"\d+", output)
def devices_of_pid(pid):
    """Return the GPU indices listed in a process's ``CUDA_VISIBLE_DEVICES``.

    The value is read from ``/proc/<pid>/environ``, whose entries are
    separated by NUL characters.

    Args:
        pid: Process id (string or int) used to build the ``/proc`` path.

    Returns:
        A list of ints, one per comma-separated entry of the variable that
        is a plain non-negative integer. Entries that are not (an empty
        value, a non-numeric identifier, ``-1``) are skipped. The list is
        empty when the variable is not set or the environ file no longer
        exists.
    """
    prefix = "CUDA_VISIBLE_DEVICES="
    try:
        # An environment block is arbitrary bytes; never fail on decoding.
        with open(f"/proc/{pid}/environ", encoding="utf-8", errors="replace") as f:
            entries = f.read().split("\0")
    except (FileNotFoundError, ProcessLookupError):
        # The process went away (or the pid was never valid).
        return []
    for entry in entries:
        if entry.startswith(prefix):
            fields = entry[len(prefix):].split(",")
            # int() would raise on "" or on non-numeric identifiers, so
            # only convert entries made up purely of decimal digits.
            return [int(field) for field in fields if field.strip().isdecimal()]
    return []
def devices_of_jid(jid):
    """Return the set of GPU indices visible to the processes of a job.

    Args:
        jid: Slurm job id whose processes are looked up with
            ``pids_of_jid``.

    Returns:
        The union (a ``set`` of ints) of the devices that
        ``devices_of_pid`` reports for each of the job's processes. Empty
        when no process lists any device.
    """
    cuda_visible_devices = set()
    # Resolve the PIDs from the ``jid`` argument itself so the result does
    # not depend on whatever a caller last stored in a global ``pids``.
    for pid in pids_of_jid(jid):
        cuda_visible_devices.update(devices_of_pid(pid))
    return cuda_visible_devices
def get_jobs():
    """List the Slurm jobs that ``squeue`` reports for this host.

    Returns:
        A list with one entry per line of ``squeue`` output; each entry is
        that line split on commas (``[job_id, user]`` for the ``%A,%u``
        format requested here).
    """
    node = socket.gethostname()
    command = ["squeue", "--format=%A,%u", "--noheader", "-w", node]
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    listing = completed.stdout.decode("utf-8")

    jobs = []
    for row in listing.splitlines():
        jobs.append(row.split(","))
    return jobs
def gpu_utilization():
    """Collect per-GPU load figures from ``nvidia-smi``.

    Returns:
        A list with one ``[gpu_percent, memory_percent]`` pair per line of
        ``nvidia-smi`` output: the first is the reported GPU utilization
        as an int, the second is ``100 * memory.used / memory.total``.
    """
    def number(text):
        # Keep only the digits, discarding units and padding ("45 %" -> 45).
        return int(re.sub('[^0-9]', "", text))

    command = [
        "nvidia-smi",
        "--query-gpu=utilization.gpu,memory.used,memory.total",
        "--format=csv,noheader",
    ]
    completed = subprocess.run(command, stdout=subprocess.PIPE)

    stats = []
    for row in completed.stdout.decode("utf-8").splitlines():
        fields = row.split(",")
        busy = number(fields[0])
        used = number(fields[1])
        total = number(fields[2])
        stats.append([busy, 100 * used / total])
    return stats
if __name__ == "__main__":
    # Per-process environment files under /proc are only readable with
    # elevated privileges.  Accept a sudo invocation (SUDO_UID is set) as
    # before, and also a process that is already running as root.
    if "SUDO_UID" not in os.environ and os.geteuid() != 0:
        print("This program requires super user.", file=sys.stderr)
        sys.exit(1)

    gpu_stats = gpu_utilization()
    for jid, user in get_jobs():
        # Processes that sstat reports for this job.
        pids = pids_of_jid(jid)
        cuda_visible_devices = devices_of_jid(jid)

        # Only average over devices nvidia-smi actually reported; a job
        # with no GPUs (or unknown indices) would otherwise divide by
        # zero or index past the end of gpu_stats.
        known = [d for d in cuda_visible_devices if 0 <= d < len(gpu_stats)]
        if not known:
            print("{} ({}) -> {} (no GPU stats available)".format(jid, user, cuda_visible_devices))
            continue

        processor = sum(gpu_stats[device][0] for device in known) / len(known)
        memory = int(sum(gpu_stats[device][1] for device in known) / len(known))
        print("{} ({}) -> {} (proc={}%, memused={}%)".format(jid, user, cuda_visible_devices, processor, memory))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment