Last active
May 17, 2021 23:55
-
-
Save vadimkantorov/225148b85863190f873238002ea7fc3b to your computer and use it in GitHub Desktop.
ssh to check nvidia-smi and aggregate by user / machine
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# authors: Theophile Dalens, Vadim Kantorov
# first version: https://github.com/willowsierra/office/blob/master/scripts/gpuscan.py
# TODO: resolve usernames with "ldapsearch -x -h ildap -b 'ou=people,dc=inria,dc=fr' -s sub 'inrialogin=kantorov' -LLL mail"
# TODO: implement thresholds
# TODO: prepare suggested e-mail text
import itertools | |
import argparse | |
import json | |
import inspect | |
import binascii | |
import pipes | |
import subprocess | |
import xml.dom.minidom | |
import socket | |
def groupby(xs, key):
    """Group the items of *xs* by ``key(item)``.

    The items are sorted by *key* first, so equal keys always end up in a
    single group regardless of the input order.

    Returns:
        A list of ``(key_value, items)`` pairs ordered by key value, where
        ``items`` is the list of inputs sharing that key (in sorted order).
    """
    ordered = sorted(xs, key=key)
    grouped = []
    for key_value, members in itertools.groupby(ordered, key=key):
        grouped.append((key_value, list(members)))
    return grouped
#def count_slots_requested_qstat(): | |
# hostname = lambda queue_name: filter(lambda h: h in queue_name, gpu_hostnames)[0] | |
# username_hostname = lambda t: (t[1], t[0]) | |
# qstat_xml = xml.dom.minidom.parseString(subprocess.check_output(['qstat', '-u', '*', '-q', gpu_queue, '-xml'])).documentElement | |
# cluster_usage = [ | |
# ( | |
# hostname(elem.getElementsByTagName('queue_name')[0].firstChild.data), | |
# elem.getElementsByTagName('JB_owner')[0].firstChild.data, | |
# int(elem.getElementsByTagName('slots')[0].firstChild.data) | |
# ) | |
# for elem in qstat_xml.getElementsByTagName('job_list') | |
# if elem.attributes['state'].value == 'running' | |
# ] | |
# by_username_hostname = {k : sum([t[2] for t in g]) for k, g in groupby(cluster_usage, username_hostname)} | |
# return by_username_hostname | |
def slot_usage(num_checks, sleep_time):
    """Sample the GPU processes of this machine and report their peak load.

    The (GPU, process) pairs are read once from ``nvidia-smi -x -q``. Each of
    the ``num_checks`` rounds then re-reads the GPU utilisation and the CPU
    usage of every such process (summed over the ``pstree`` tree it belongs
    to) and keeps the maximum seen so far.

    Args:
        num_checks: Number of sampling rounds.
        sleep_time: Seconds to wait between two consecutive rounds.

    Returns:
        A list of ``(gpuid, pid, username, max_cpu_load, max_gpu_load)``
        tuples, one per (GPU, process) pair that could be sampled at least
        once. All members are ints, strings or floats, so the result can be
        passed to ``json.dumps``.

    Raises:
        subprocess.CalledProcessError: If ``nvidia-smi`` itself fails.
    """
    # NOTE: ssh_slot_usage() sends the source of this function to remote hosts
    # and exec()s it there. It must therefore stay self-contained: imports are
    # local and helpers are nested instead of living at module level.
    import re
    import subprocess
    import time
    import xml.dom.minidom

    def run(argv):
        # Run a command and return its standard output as text.
        return subprocess.check_output(argv).decode()

    def query_gpus():
        # One <gpu> element per device, in nvidia-smi's enumeration order.
        report = xml.dom.minidom.parseString(run(['nvidia-smi', '-x', '-q']))
        return report.documentElement.getElementsByTagName('gpu')

    def text_of(node, tag):
        # Text content of the first <tag> descendant of *node*.
        return node.getElementsByTagName(tag)[0].firstChild.data

    def tree_cpu_load(user, pid):
        # Total %CPU of the pstree tree(s) of *user* that contain *pid*.
        # pstree -p prints PIDs in parentheses; matching bare digit runs would
        # also pick up digits from command names such as "python3".
        trees = [
            [int(found) for found in re.findall(r'\((\d+)\)', tree)]
            for tree in run(['pstree', '-U', user, '-p']).split('\n\n')
        ]
        # Fall back to the process itself: calling ps without any PID would
        # report the caller's own processes instead.
        pids = [p for tree in trees if pid in tree for p in tree] or [pid]
        percentages = run(['ps', '-o', 'pcpu', '--no-headers'] + [str(p) for p in pids])
        return sum(float(value) for value in percentages.split())

    gpu_pids = [
        (gpuid, int(text_of(process, 'pid')))
        for gpuid, gpu in enumerate(query_gpus())
        for process in gpu.getElementsByTagName('process_info')
    ]

    username, max_cpu_load, max_gpu_load = {}, {}, {}
    for check in range(num_checks):
        if check:
            # Sleep only between rounds, never after the last one.
            time.sleep(sleep_time)
        gpus = query_gpus()
        for gpuid, pid in gpu_pids:
            try:
                user = run(['ps', '-o', 'user', '--no-headers', str(pid)]).strip()
                cpu_load = tree_cpu_load(user, pid)
            except subprocess.CalledProcessError:
                # The process exited after the initial snapshot; keep whatever
                # was measured in earlier rounds and carry on with the others.
                continue
            gpu_load = float(text_of(gpus[gpuid], 'gpu_util').rstrip('%'))
            username[pid] = user
            max_cpu_load[pid] = max(max_cpu_load.get(pid, 0), cpu_load)
            # Keyed by (gpu, pid) so that a process running on several GPUs
            # does not mix the utilisation of different devices.
            slot = (gpuid, pid)
            max_gpu_load[slot] = max(max_gpu_load.get(slot, 0), gpu_load)

    return [
        (gpuid, pid, username[pid], max_cpu_load[pid], max_gpu_load[(gpuid, pid)])
        for gpuid, pid in gpu_pids
        if (gpuid, pid) in max_gpu_load
    ]
def ssh_slot_usage(hostname, num_checks, sleep_time, localhost=socket.gethostname()):
    """Run ``slot_usage`` on *hostname*, locally or through ssh.

    Args:
        hostname: Machine to inspect.
        num_checks: Forwarded to ``slot_usage``.
        sleep_time: Forwarded to ``slot_usage``.
        localhost: Name of the current machine; defaults to the value of
            ``socket.gethostname()`` taken when this function was defined.

    Returns:
        The result of ``slot_usage``. For a remote host it has gone through
        JSON, so the records come back as lists rather than tuples.
    """
    runs_here = hostname in ('localhost', localhost) or hostname.startswith(localhost + '.')
    if runs_here:
        return slot_usage(num_checks, sleep_time)

    # The remote side has no copy of this script: send the source of
    # slot_usage hex-encoded (which sidesteps shell quoting), exec it there
    # and read the JSON it prints.
    hex_source = binascii.hexlify(inspect.getsource(slot_usage).encode())
    remote_command = '''python3 -c "import json, binascii; code = binascii.unhexlify(%s).decode(); exec(code); print(json.dumps(slot_usage(%d, %d)))" ''' % (hex_source, num_checks, sleep_time)
    remote_output = subprocess.check_output(['ssh', hostname, remote_command])
    return json.loads(remote_output)
def count_slots_used(gpu_hostnames, num_checks, sleep_time):
    """Collect GPU usage from every host and summarise it per user and host.

    Args:
        gpu_hostnames: Hosts to query through ``ssh_slot_usage``.
        num_checks: Forwarded to ``ssh_slot_usage``.
        sleep_time: Forwarded to ``ssh_slot_usage``.

    Returns:
        A dict mapping ``(username, hostname)`` to ``(gpu_count, gpu_groups)``.
        ``gpu_groups`` is a list of ``((username, hostname, gpuid), rows)``
        pairs, ``gpu_count`` is its length, and every row is
        ``(hostname, gpuid, pid, username, cpu_load, gpu_load)``.
    """
    def owner_host_gpu(row):
        # row = (hostname, gpuid, pid, username, cpu_load, gpu_load)
        return (row[3], row[0], row[1])

    def owner_host(gpu_group):
        # gpu_group = ((username, hostname, gpuid), rows)
        gpu_key = gpu_group[0]
        return (gpu_key[0], gpu_key[1])

    rows = []
    for hostname in gpu_hostnames:
        for record in ssh_slot_usage(hostname, num_checks, sleep_time):
            rows.append((hostname,) + tuple(record))

    per_gpu = groupby(rows, owner_host_gpu)
    summary = {}
    for user_host, gpu_groups in groupby(per_gpu, owner_host):
        summary[user_host] = (len(gpu_groups), gpu_groups)
    return summary
def compare_used_requested(slots_used, slots_requested):
    """Find the (user, host) pairs whose used and requested slot counts differ.

    Args:
        slots_used: Dict mapping ``(username, hostname)`` to
            ``(count, details)``.
        slots_requested: Dict of the same shape; a missing key counts as
            ``(0, [])`` (the same holds for ``slots_used``).

    Returns:
        A list of ``(username, rows)`` pairs ordered by username. Each row is
        ``(username, hostname, used, requested)``, where ``used`` and
        ``requested`` are the ``(count, details)`` values, and rows are
        ordered by hostname.
    """
    mismatches = []
    for key in sorted(set(slots_used) | set(slots_requested)):
        used = slots_used.get(key, (0, []))
        requested = slots_requested.get(key, (0, []))
        # Compare the counts only: the detail lists of the two sides describe
        # different things and would make equal counts look like a mismatch.
        if used[0] != requested[0]:
            mismatches.append(key + (used, requested))
    # The rows are already ordered by (username, hostname), so consecutive
    # grouping on the username is enough.
    return [
        (username, list(rows))
        for username, rows in itertools.groupby(mismatches, key=lambda row: row[0])
    ]
def print_message(by_user, verbose):
    """Print, per user, the hosts where used and requested slot counts differ.

    Args:
        by_user: Iterable of ``(username, rows)`` pairs; each row is
            ``(username, hostname, used, requested)`` with ``used`` and
            ``requested`` being ``(count, details)`` pairs.
        verbose: When true, also print GPU id, PID, CPU load and GPU load of
            every process behind a reported host line.
    """
    for user, rows in by_user:
        print(user)
        for _, host, used_info, requested_info in rows:
            used_count = used_info[0]
            requested_count = requested_info[0]
            if used_count == requested_count:
                continue
            print('\t%s: %d used / %d requested' % (host, used_count, requested_count))
            if not verbose:
                continue
            # used_info[1] holds (gpu_key, records) groups; the fields of a
            # record are addressed from its end.
            details = []
            for _gpu_key, records in used_info[1]:
                for record in records:
                    details.append(dict(gpuid=record[-5], pid=record[-4], cpuload=record[-2], gpuload=record[-1]))
            print('\t\t', details)
if __name__ == '__main__':
    # Command-line entry point: scan the hosts, compare the slots in use with
    # the slots requested and print the differences per user.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-checks', default=1, type=int)
    parser.add_argument('--sleep-time', default=0, type=int)
    parser.add_argument(
        '--gpu-hostnames',
        nargs='*',
        # Default hosts are numbered 09 to 12.
        default=['rack-gamir-g{0:02d}.cs.tau.ac.il'.format(index) for index in range(9, 13)],
    )
    parser.add_argument('--verbose', '-v', action='store_true')
    args = parser.parse_args()

    # No source of requested slots is wired in, so every slot in use is
    # compared against zero requested slots.
    slots_requested = {}  # count_slots_requested()
    slots_used = count_slots_used(args.gpu_hostnames, args.num_checks, args.sleep_time)
    by_user = compare_used_requested(slots_used, slots_requested)
    print_message(by_user, args.verbose)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment