Created
September 10, 2020 01:05
-
-
Save gangiman/1309b2b5ea3a8da7b5bd62933366cb92 to your computer and use it in GitHub Desktop.
A script to find who hoarded all GPU memory on a shared server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import pwd | |
from time import sleep | |
from importlib import import_module | |
from collections import defaultdict | |
import psutil | |
import docker | |
import pandas as pd | |
class UserRegistry:
    """Registry of human user accounts on the host, built from the system
    password database.

    Filters ``pwd.getpwall()`` down to accounts that look like real people:
    a home directory under ``/home``, a login shell ending in ``sh``, and a
    username not listed in ``excluded_users``.
    """

    def __init__(self, excluded_users=('service', 'test')):
        """Build the filtered user table.

        :param excluded_users: usernames to drop even if they otherwise
            look like human accounts.
        """
        all_pwd_records = pwd.getpwall()
        # Introspect the struct_passwd fields instead of hard-coding them.
        fields = [_field for _field in dir(all_pwd_records[0]) if _field.startswith('pw_')]
        list_of_dicts = [
            {_field: getattr(_record, _field) for _field in fields}
            for _record in all_pwd_records
        ]
        users_df = pd.DataFrame(list_of_dicts).set_index('pw_name')
        # Keep only human accounts: /home dir, *sh login shell, not excluded.
        self.users_df = users_df[users_df.pw_dir.str.startswith('/home') &
                                 users_df.pw_shell.str.endswith('sh') &
                                 ~users_df.index.isin(excluded_users)]

    def get_process_owner(self, proc_chain):
        """Print the owner of the leaf process of *proc_chain* if the owner
        is a known, non-root user.

        :param proc_chain: list of ``psutil.Process`` objects, root first;
            only the last (leaf) element is inspected.
        """
        proc = proc_chain[-1]
        username = proc.username()
        if username != 'root' and username in self.users_df.index:
            print(f"User '{username}' hogs GPU memory with process {proc.pid}")

    def get_container_owner(self, client, container_hash, pids):
        """Print ownership hints (mounts, image name/author) for a
        GPU-hogging container.

        :param client: a ``docker`` client used to resolve hash strings.
        :param container_hash: either a resolved ``Container`` object or a
            container hash string.
        :param pids: GPU-holding pids attributed to this container.
        :raises AssertionError: if *container_hash* is neither of the
            accepted types.
        """
        if isinstance(container_hash, docker.models.containers.Container):
            container = container_hash
        elif isinstance(container_hash, str):
            container = client.containers.get(container_hash)
        else:
            # BUG FIX: the original raised AssertionError("") with an empty,
            # useless message; same exception type, now informative.
            raise AssertionError(
                f"container_hash must be a Container or str, "
                f"got {type(container_hash).__name__}"
            )
        print(f"GPU hogging container '{container.attrs['Name'].lstrip('/')}' with pids: {', '.join(map(str, pids))}")
        print("Has following mounts:")
        mounts_str = ''
        for _mount in container.attrs['Mounts']:
            _source_dir = _mount['Source']
            _dist_dir = _mount['Destination']
            user = 'unknown'
            # A mount sourced from a user's home directory reveals the owner.
            for _home_dir in self.users_df.pw_dir:
                if _source_dir.startswith(_home_dir):
                    user = _home_dir.split('/')[-1]
                    # BUG FIX: stop at the first matching home dir; the
                    # original kept scanning and could overwrite the match.
                    break
            mounts_str += f"\n\t'{_source_dir}': '{_dist_dir}', from homedir of user {user}"
        print(mounts_str)
        # Prefer a human-readable repo tag; fall back to the short image id
        # (skip the "sha256:" prefix, keep 12 hex chars).
        image_name = container.image.attrs['Id'][7:19]
        image_author = container.image.attrs['Author']
        if container.image.attrs['RepoTags']:
            image_name = container.image.attrs['RepoTags'][0]
        print(f"Docker image name is: {image_name}, and author is: {image_author}")
class NVML:
    """Context manager around the ``pynvml`` NVIDIA bindings.

    The module is imported lazily so the rest of the script can load on
    machines without the NVIDIA driver; ``nvmlInit``/``nvmlShutdown`` are
    bound to ``with``-block entry and exit.
    """

    def __init__(self):
        # Lazy import: only resolved when an NVML object is created.
        self.pynvml = import_module('pynvml')

    def __enter__(self):
        self.pynvml.nvmlInit()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.pynvml.nvmlShutdown()

    def get_usage(self):
        """Print per-device utilisation and return the pids of every GPU
        compute process found across all devices."""
        megabyte = 1024 * 1024
        nv = self.pynvml
        print(f"Driver Version: {nv.nvmlSystemGetDriverVersion()}")
        gpu_pids = []
        for device_index in range(nv.nvmlDeviceGetCount()):
            handle = nv.nvmlDeviceGetHandleByIndex(device_index)
            rates = nv.nvmlDeviceGetUtilizationRates(handle)
            print(f"Device {device_index}: {nv.nvmlDeviceGetName(handle)}")
            print(f"Device utilisation: {rates.gpu} , memory {rates.memory}")
            for nv_process in nv.nvmlDeviceGetComputeRunningProcesses(handle):
                print(f"\tProcess {nv_process.pid} uses {nv_process.usedGpuMemory // megabyte} MB of GPU memory")
                gpu_pids.append(nv_process.pid)
        return gpu_pids

    def monitor_gpu_utilisation(self, step=1, limit=20):
        """Print a one-line load/memory summary across all GPUs, *limit*
        times, sleeping *step* seconds between samples."""
        nv = self.pynvml
        n_devices = nv.nvmlDeviceGetCount()
        for _ in range(limit):
            readings = []
            for gpu_id in range(n_devices):
                handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
                rates = nv.nvmlDeviceGetUtilizationRates(handle)
                readings.append(f"load {rates.gpu:3d} , mem {rates.memory:3d}")
            print(' | '.join(readings))
            sleep(step)
def is_containered_process_chain(proc_chain):
    """Return the container hash if *proc_chain* is rooted in containerd,
    else ``False``.

    :param proc_chain: list of ``psutil.Process``-like objects, root first;
        only ``.cmdline()`` is called on the first two elements.
    :returns: the container hash (last path component of the shim's
        ``-workdir``) as a string, or ``False`` for a bare host process.
    :raises AssertionError: if containerd's direct child is not a
        ``containerd-shim`` process.
    """
    root_cmd = proc_chain[0].cmdline()
    # Bare host process: root of the chain is not the containerd daemon.
    if len(root_cmd) != 1 or root_cmd[0] != '/usr/bin/containerd':
        return False
    shim_cmd = proc_chain[1].cmdline()
    # BUG FIX: explicit raise instead of `assert`, which is silently
    # stripped under `python -O`; exception type kept for compatibility.
    if shim_cmd[0] != 'containerd-shim':
        raise AssertionError(
            f"expected containerd-shim under containerd, got {shim_cmd[0]!r}"
        )
    # The shim's -workdir path ends with the container hash.
    workdir = shim_cmd[shim_cmd.index('-workdir') + 1]
    return workdir.split('/')[-1]
def get_proc_chain(pid, verbose=False):
    """Walk from *pid* up the parent chain to (but excluding) init (pid 1),
    returning the chain of ``psutil.Process`` objects ordered root-first.

    :param pid: pid of the leaf process to start from.
    :param verbose: when True, print each hop as it is visited.
    :returns: list of processes, outermost ancestor first, *pid* last.
    """
    chain = []
    current = pid
    while current != 1:
        proc = psutil.Process(current)
        # Prepend so the final list reads root -> ... -> leaf.
        chain.insert(0, proc)
        if verbose:
            print(f"PID {current}| u {proc.username()}|cmd: {' '.join(proc.cmdline())}")
        current = proc.ppid()
    return chain
def main():
    """Report which users and containers currently hold GPU memory."""
    user_registry = UserRegistry()
    client = docker.from_env()

    print('-' * 30 + 'Getting GPU memory usage' + '-' * 30)
    with NVML() as nvml:
        pids = nvml.get_usage()

    # Split GPU pids into containerised ones (grouped by container hash)
    # and bare host process chains.
    containers_to_pids_map = defaultdict(list)
    gpu_process_chains = []
    for gpu_pid in pids:
        chain = get_proc_chain(gpu_pid, verbose=False)
        container_hash = is_containered_process_chain(chain)
        if container_hash:
            containers_to_pids_map[container_hash].append(gpu_pid)
        else:
            gpu_process_chains.append(chain)

    print('-' * 30 + 'Determining process owner' + '-' * 30)
    for chain in gpu_process_chains:
        user_registry.get_process_owner(chain)
        print('-' * 80)

    print('-' * 30 + 'Determining container owner' + '-' * 30)
    for container_hash, container_pids in containers_to_pids_map.items():
        user_registry.get_container_owner(client, container_hash, container_pids)
        print('-' * 80)
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment