Skip to content

Instantly share code, notes, and snippets.

@vadimkantorov
Last active May 17, 2021 23:55
Show Gist options
  • Save vadimkantorov/225148b85863190f873238002ea7fc3b to your computer and use it in GitHub Desktop.
Save vadimkantorov/225148b85863190f873238002ea7fc3b to your computer and use it in GitHub Desktop.
ssh to check nvidia-smi and aggregate by user / machine
#! /usr/bin/python3
# authors: Theophile Dalens, Vadim Kantorov
# first version: https://github.com/willowsierra/office/blob/master/scripts/gpuscan.py
# TODO: resolve usernames with "ldapsearch -x -h ildap -b 'ou=people,dc=inria,dc=fr' -s sub 'inrialogin=kantorov' -LLL mail"
# TODO: implement thresholds
# TODO: prepare suggested e-mail text
import itertools
import argparse
import json
import inspect
import binascii
import pipes
import subprocess
import xml.dom.minidom
import socket
def groupby(xs, key):
    """Group the items of *xs* by ``key(item)``.

    The items are sorted by *key* first, so all items sharing a key value end
    up in a single group regardless of their original position.

    Returns a list of ``(key_value, items)`` pairs ordered by key value, where
    ``items`` is a list.
    """
    grouped = []
    for key_value, members in itertools.groupby(sorted(xs, key=key), key=key):
        grouped.append((key_value, list(members)))
    return grouped
#def count_slots_requested_qstat():
# hostname = lambda queue_name: filter(lambda h: h in queue_name, gpu_hostnames)[0]
# username_hostname = lambda t: (t[1], t[0])
# qstat_xml = xml.dom.minidom.parseString(subprocess.check_output(['qstat', '-u', '*', '-q', gpu_queue, '-xml'])).documentElement
# cluster_usage = [
# (
# hostname(elem.getElementsByTagName('queue_name')[0].firstChild.data),
# elem.getElementsByTagName('JB_owner')[0].firstChild.data,
# int(elem.getElementsByTagName('slots')[0].firstChild.data)
# )
# for elem in qstat_xml.getElementsByTagName('job_list')
# if elem.attributes['state'].value == 'running'
# ]
# by_username_hostname = {k : sum([t[2] for t in g]) for k, g in groupby(cluster_usage, username_hostname)}
# return by_username_hostname
def slot_usage(num_checks, sleep_time):
    """Sample the processes that hold GPUs on the machine running this code.

    The source text of this function is shipped to remote hosts and exec'd
    there (see ``ssh_slot_usage``), so it must stay self-contained: every
    module it needs is imported inside the body and it must not reference
    any other name from this file.

    Args:
        num_checks: number of sampling rounds; the peak load over all rounds
            is reported. Expected to be >= 1 (with zero rounds no user or
            load information is collected for the listed processes).
        sleep_time: seconds to wait between two consecutive rounds.

    Returns:
        A list with one ``(gpuid, pid, username, max_cpu_load, max_gpu_load)``
        tuple per GPU process listed by ``nvidia-smi``. ``max_cpu_load`` is
        the peak of the summed ``pcpu`` of the process tree containing the
        pid; ``max_gpu_load`` is the peak ``gpu_util`` of the GPU the process
        is on (a per-GPU figure, not a per-process one).

    Raises:
        subprocess.CalledProcessError: if ``nvidia-smi``, ``ps`` or ``pstree``
            exits with a non-zero status.
    """
    import re
    import subprocess
    import time
    import xml.dom.minidom

    def run(command):
        return subprocess.check_output(command).decode()

    def query_gpus():
        # One <gpu> element per device, in the order nvidia-smi reports them.
        report = xml.dom.minidom.parseString(run(['nvidia-smi', '-x', '-q']))
        return report.documentElement.getElementsByTagName('gpu')

    # The set of (gpu index, pid) pairs is fixed once, before sampling starts.
    gpupids = [
        (gpuid, int(process.getElementsByTagName('pid')[0].firstChild.data))
        for gpuid, gpu in enumerate(query_gpus())
        for process in gpu.getElementsByTagName('process_info')
    ]

    max_cpu_load, max_gpu_load, username = {}, {}, {}
    for check in range(num_checks):
        gpus = query_gpus()
        for gpuid, gpupid in gpupids:
            username[gpupid] = run(['ps', '-o', 'user', '--no-headers', str(gpupid)]).strip()

            # pstree prints one tree per blank-line separated chunk; with -p
            # every PID appears in parentheses after the process name. Only
            # parenthesised numbers are taken so that digits inside process
            # names (e.g. "python3") are not mistaken for PIDs.
            trees = run(['pstree', '-U', username[gpupid], '-p']).split('\n\n')
            pid_lists = [list(map(int, re.findall(r'\((\d+)\)', tree))) for tree in trees]
            pid_list = [pid for pids in pid_lists if gpupid in pids for pid in pids]

            cpu_load = sum(map(float, run(['ps', '-o', 'pcpu', '--no-headers'] + list(map(str, pid_list))).split()))
            gpu_load = float(gpus[gpuid].getElementsByTagName('gpu_util')[0].firstChild.data.rstrip('%'))
            max_cpu_load[gpupid] = max(max_cpu_load.get(gpupid, 0), cpu_load)
            max_gpu_load[gpupid] = max(max_gpu_load.get(gpupid, 0), gpu_load)

        # Only pause between rounds; sleeping after the last one is wasted time.
        if check + 1 < num_checks:
            time.sleep(sleep_time)

    return [
        (gpuid, gpupid, username[gpupid], max_cpu_load[gpupid], max_gpu_load[gpupid])
        for gpuid, gpupid in gpupids
    ]
def ssh_slot_usage(hostname, num_checks, sleep_time, localhost = socket.gethostname()):
    """Return the ``slot_usage`` measurements for *hostname*.

    When *hostname* designates this machine ('localhost', the local host name,
    or the local host name followed by a domain suffix) the measurement runs
    in-process. Otherwise the source of ``slot_usage`` is hex-encoded, sent
    over ``ssh`` to a remote ``python3 -c`` one-liner that decodes and execs
    it, and the JSON the remote side prints is parsed and returned.

    Args:
        hostname: machine to inspect.
        num_checks: forwarded to ``slot_usage``.
        sleep_time: forwarded to ``slot_usage``.
        localhost: name of the local machine; the default is computed once,
            when the function is defined.
    """
    runs_here = hostname in ('localhost', localhost) or hostname.startswith(localhost + '.')
    if runs_here:
        return slot_usage(num_checks, sleep_time)

    # Hex-encoding keeps the shipped source free of any character the remote
    # shell could interpret.
    encoded_source = binascii.hexlify(inspect.getsource(slot_usage).encode())
    remote_command = '''python3 -c "import json, binascii; code = binascii.unhexlify(%s).decode(); exec(code); print(json.dumps(slot_usage(%d, %d)))" ''' % (encoded_source, num_checks, sleep_time)
    remote_output = subprocess.check_output(['ssh', hostname, remote_command])
    return json.loads(remote_output)
def count_slots_used(gpu_hostnames, num_checks, sleep_time):
    """Collect GPU usage from every host and index it by (username, hostname).

    Each measurement returned by ``ssh_slot_usage`` is prefixed with its host
    name, giving rows of the form
    ``(hostname, gpuid, pid, username, max_cpu_load, max_gpu_load)``.

    Returns:
        A dict mapping ``(username, hostname)`` to ``(gpu_count, gpu_groups)``
        where ``gpu_groups`` is a list of
        ``((username, hostname, gpuid), rows)`` pairs, one per distinct GPU
        the user occupies on that host, and ``gpu_count`` is its length.
    """
    def per_gpu_key(row):
        # (username, hostname, gpuid)
        return (row[3], row[0], row[1])

    def per_host_key(gpu_group):
        # First two fields of the per-GPU key: (username, hostname).
        return gpu_group[0][:2]

    rows = [
        (hostname,) + tuple(measurement)
        for hostname in gpu_hostnames
        for measurement in ssh_slot_usage(hostname, num_checks, sleep_time)
    ]
    gpu_groups = groupby(rows, per_gpu_key)

    usage = {}
    for user_host, groups in groupby(gpu_groups, per_host_key):
        usage[user_host] = (len(groups), groups)
    return usage
def compare_used_requested(slots_used, slots_requested):
    """List, per user, the hosts where used and requested GPU counts differ.

    Args:
        slots_used: dict mapping ``(username, hostname)`` to
            ``(count, details)``.
        slots_requested: dict of the same shape; a key missing from either
            dict is treated as ``(0, [])``.

    Returns:
        A list of ``(username, entries)`` pairs ordered by username, where
        each entry is ``(username, hostname, used, requested)`` and ``used``
        and ``requested`` are the ``(count, details)`` tuples.
    """
    mismatches = []
    for key in sorted(set(slots_used) | set(slots_requested)):
        used = slots_used.get(key, (0, []))
        requested = slots_requested.get(key, (0, []))
        # Compare the counts only: the detail lists of the two sides are not
        # comparable, and print_message reports on counts, so comparing whole
        # tuples would keep users whose counts actually match.
        if used[0] != requested[0]:
            mismatches.append(key + (used, requested))
    return groupby(mismatches, lambda entry: entry[0])
def print_message(by_user, verbose):
    """Print a per-user report of hosts whose GPU counts do not match.

    Args:
        by_user: iterable of ``(username, entries)`` pairs; each entry is
            ``(username, hostname, used, requested)`` where ``used`` and
            ``requested`` are ``(count, details)`` tuples.
        verbose: when true, each reported host is followed by a list of
            per-process dicts (gpuid, pid, cpuload, gpuload) built from the
            rows stored in ``used``.
    """
    for user, entries in by_user:
        print(user)
        for _, hostname, used_, requested_ in entries:
            used = used_[0]
            requested = requested_[0]
            if used == requested:
                continue
            print('\t%s: %d used / %d requested' % (hostname, used, requested))
            if not verbose:
                continue
            # Rows are addressed from the end: ..., gpuid, pid, username,
            # cpu load, gpu load.
            details = []
            for _key, rows in used_[1]:
                for row in rows:
                    details.append({'gpuid': row[-5], 'pid': row[-4], 'cpuload': row[-2], 'gpuload': row[-1]})
            print('\t\t', details)
if __name__ == '__main__':
    # Command-line entry point: measure GPU usage on the given hosts and
    # report, per user, where it differs from what was requested.
    default_hostnames = ['rack-gamir-g{0:02d}.cs.tau.ac.il'.format(index) for index in range(9, 13)]

    parser = argparse.ArgumentParser()
    parser.add_argument('--num-checks', type = int, default = 1)
    parser.add_argument('--sleep-time', type = int, default = 0)
    parser.add_argument('--gpu-hostnames', nargs = '*', default = default_hostnames)
    parser.add_argument('--verbose', '-v', action = 'store_true')
    args = parser.parse_args()

    slots_used = count_slots_used(args.gpu_hostnames, args.num_checks, args.sleep_time)
    # Requests are not collected yet (count_slots_requested is disabled), so
    # every used slot is reported against zero requested slots.
    slots_requested = {} #count_slots_requested()
    print_message(compare_used_requested(slots_used, slots_requested), args.verbose)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment