Skip to content

Instantly share code, notes, and snippets.

@gangiman
Created September 10, 2020 01:05
Show Gist options
  • Save gangiman/1309b2b5ea3a8da7b5bd62933366cb92 to your computer and use it in GitHub Desktop.
A script to find who hoarded all GPU memory on a shared server
#!/usr/bin/env python
import pwd
from time import sleep
from importlib import import_module
from collections import defaultdict
import psutil
import docker
import pandas as pd
class UserRegistry:
    """Registry of human users on this host, built from the passwd database.

    Keeps only accounts that look like real people: home directory under
    /home, a login shell ending in 'sh', and not in the excluded list.
    """

    def __init__(self, excluded_users=('service', 'test')):
        """Build the filtered user table.

        :param excluded_users: account names to drop even if they otherwise
            look like human users.
        """
        all_pwd_records = pwd.getpwall()
        # Collect the pw_* struct fields dynamically from the first record.
        fields = [_field for _field in dir(all_pwd_records[0]) if _field.startswith('pw_')]
        list_of_dicts = [
            {_field: getattr(_record, _field) for _field in fields}
            for _record in all_pwd_records
        ]
        users_df = pd.DataFrame(list_of_dicts)
        users_df = users_df.set_index('pw_name')
        # Heuristic for "real" users: /home dir, *sh shell, not excluded.
        self.users_df = users_df[users_df.pw_dir.str.startswith('/home') &
                                 users_df.pw_shell.str.endswith('sh') &
                                 ~users_df.index.isin(excluded_users)]

    def get_process_owner(self, proc_chain):
        """Print the owner of a bare (non-containerized) GPU process chain.

        :param proc_chain: list of psutil.Process, root-most first; the last
            element is the process actually holding GPU memory.
        """
        proc = proc_chain[-1]
        username = proc.username()
        if username != 'root' and username in self.users_df.index:
            print(f"User '{username}' hogs GPU memory with process {proc.pid}")

    def get_container_owner(self, client, container_hash, pids):
        """Print identifying information for a container holding GPU memory.

        Attributes the container to a user by matching its bind-mount source
        paths against the known home directories.

        :param client: docker client (e.g. docker.from_env()).
        :param container_hash: container id/hash string, or a Container object.
        :param pids: GPU-holding pids belonging to this container.
        :raises TypeError: if container_hash is neither str nor Container.
        """
        if isinstance(container_hash, docker.models.containers.Container):
            container = container_hash
        elif isinstance(container_hash, str):
            container = client.containers.get(container_hash)
        else:
            # BUG FIX: was `raise AssertionError("")` — wrong exception type
            # and an empty message for an invalid-argument error.
            raise TypeError(
                f"container_hash must be a str or Container, got {type(container_hash).__name__}"
            )
        print(f"GPU hogging container '{container.attrs['Name'].lstrip('/')}' with pids: {', '.join(map(str, pids))}")
        print("Has following mounts:")
        mounts_str = ''
        for _mount in container.attrs['Mounts']:
            _source_dir = _mount['Source']
            _dist_dir = _mount['Destination']
            user = 'unknown'
            # Attribute the mount to whichever user's home dir contains it.
            for _home_dir in self.users_df.pw_dir:
                if _source_dir.startswith(_home_dir):
                    user = _home_dir.split('/')[-1]
            mounts_str += f"\n\t'{_source_dir}': '{_dist_dir}', from homedir of user {user}"
        print(mounts_str)
        # Fall back to a short image id when the image carries no repo tags.
        image_name = container.image.attrs['Id'][7:19]
        image_author = container.image.attrs['Author']
        if container.image.attrs['RepoTags']:
            image_name = container.image.attrs['RepoTags'][0]
        print(f"Docker image name is: {image_name}, and author is: {image_author}")
class NVML:
    """Context-manager wrapper around pynvml (NVIDIA Management Library)."""

    def __init__(self):
        # Import lazily so this module loads on machines without pynvml.
        self.pynvml = import_module('pynvml')

    def __enter__(self):
        self.pynvml.nvmlInit()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.pynvml.nvmlShutdown()

    def get_usage(self):
        """Print per-device utilisation and per-process GPU memory usage.

        :returns: list of pids of processes currently holding GPU memory.
        """
        nv = self.pynvml
        megabyte = 1024 * 1024
        gpu_pids = []
        print(f"Driver Version: {nv.nvmlSystemGetDriverVersion()}")
        for device_index in range(nv.nvmlDeviceGetCount()):
            handle = nv.nvmlDeviceGetHandleByIndex(device_index)
            rates = nv.nvmlDeviceGetUtilizationRates(handle)
            print(f"Device {device_index}: {nv.nvmlDeviceGetName(handle)}")
            print(f"Device utilisation: {rates.gpu} , memory {rates.memory}")
            for nv_process in nv.nvmlDeviceGetComputeRunningProcesses(handle):
                print(f"\tProcess {nv_process.pid} uses {nv_process.usedGpuMemory // megabyte} MB of GPU memory")
                gpu_pids.append(nv_process.pid)
        return gpu_pids

    def monitor_gpu_utilisation(self, step=1, limit=20):
        """Print one load/memory line per GPU, `limit` times, every `step` seconds."""
        nv = self.pynvml
        device_count = nv.nvmlDeviceGetCount()
        for _ in range(limit):
            readings = []
            for gpu_id in range(device_count):
                handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
                rates = nv.nvmlDeviceGetUtilizationRates(handle)
                readings.append(f"load {rates.gpu:3d} , mem {rates.memory:3d}")
            print(' | '.join(readings))
            sleep(step)
def is_containered_process_chain(proc_chain):
    """Detect whether a process chain is rooted in docker/containerd.

    :param proc_chain: list of psutil.Process objects, root-most first.
    :returns: the container hash (last path component of the shim's
        -workdir argument) when containerized, otherwise False.
    """
    root_cmd = proc_chain[0].cmdline()
    if root_cmd != ['/usr/bin/containerd']:
        return False
    shim_cmd = proc_chain[1].cmdline()
    assert shim_cmd[0] == 'containerd-shim'
    # The shim's -workdir path ends with the container hash.
    workdir = shim_cmd[shim_cmd.index('-workdir') + 1]
    return workdir.split('/')[-1]
def get_proc_chain(pid, verbose=False):
    """Walk the parent chain from `pid` up to (but excluding) pid 1.

    :param pid: starting process id.
    :param verbose: when True, print each process as it is visited.
    :returns: list of psutil.Process objects ordered root-most first
        (ancestor nearest init first, the starting pid last).
    """
    visited = []
    current = pid
    while current != 1:
        proc = psutil.Process(current)
        visited.append(proc)
        if verbose:
            print(f"PID {current}| u {proc.username()}|cmd: {' '.join(proc.cmdline())}")
        current = proc.ppid()
    return visited[::-1]
def main():
    """Report which users and containers currently hold GPU memory."""
    user_registry = UserRegistry()
    docker_client = docker.from_env()
    separator = '-' * 30
    print(f"{separator}Getting GPU memory usage{separator}")
    with NVML() as nvml:
        gpu_pids = nvml.get_usage()
    containers_to_pids_map = defaultdict(list)
    bare_process_chains = []
    # Split GPU-holding pids into containerized vs bare-metal processes.
    for _pid in gpu_pids:
        chain = get_proc_chain(_pid, verbose=False)
        container_hash = is_containered_process_chain(chain)
        if container_hash:
            containers_to_pids_map[container_hash].append(_pid)
        else:
            bare_process_chains.append(chain)
    print(f"{separator}Determining process owner{separator}")
    for _chain in bare_process_chains:
        user_registry.get_process_owner(_chain)
    print('-' * 80)
    print(f"{separator}Determining container owner{separator}")
    for _cont_hash, _pids in containers_to_pids_map.items():
        user_registry.get_container_owner(docker_client, _cont_hash, _pids)
    print('-' * 80)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment