Skip to content

Instantly share code, notes, and snippets.

@gangiman
Created September 10, 2020 01:05
Show Gist options
  • Save gangiman/1309b2b5ea3a8da7b5bd62933366cb92 to your computer and use it in GitHub Desktop.
A script to find who hoarded all GPU memory on a shared server
#!/usr/bin/env python
import pwd
from time import sleep
from importlib import import_module
from collections import defaultdict
import psutil
import docker
import pandas as pd
class UserRegistry:
    """Registry of human users on this host, built from the passwd database.

    Keeps only accounts that look like real people: home directory under
    /home, a login shell ending in 'sh', and not in the excluded list.
    """

    def __init__(self, excluded_users=('service', 'test')):
        """Build the filtered user table.

        :param excluded_users: account names to drop even if they otherwise
            look like human users.
        """
        all_pwd_records = pwd.getpwall()
        # Collect the pw_* struct fields dynamically from the first record.
        fields = [_field for _field in dir(all_pwd_records[0]) if _field.startswith('pw_')]
        list_of_dicts = [
            {_field: getattr(_record, _field) for _field in fields}
            for _record in all_pwd_records
        ]
        users_df = pd.DataFrame(list_of_dicts)
        users_df = users_df.set_index('pw_name')
        # Heuristic for "real" users: /home dir, *sh shell, not excluded.
        self.users_df = users_df[users_df.pw_dir.str.startswith('/home') &
                                 users_df.pw_shell.str.endswith('sh') &
                                 ~users_df.index.isin(excluded_users)]

    def get_process_owner(self, proc_chain):
        """Print the owner of a bare (non-containerized) GPU process chain.

        :param proc_chain: list of psutil.Process, root-most first; the last
            element is the process actually holding GPU memory.
        """
        proc = proc_chain[-1]
        username = proc.username()
        if username != 'root' and username in self.users_df.index:
            print(f"User '{username}' hogs GPU memory with process {proc.pid}")

    def get_container_owner(self, client, container_hash, pids):
        """Print identifying information for a container holding GPU memory.

        Attributes the container to a user by matching its bind-mount source
        paths against the known home directories.

        :param client: docker client (e.g. docker.from_env()).
        :param container_hash: container id/hash string, or a Container object.
        :param pids: GPU-holding pids belonging to this container.
        :raises TypeError: if container_hash is neither str nor Container.
        """
        if isinstance(container_hash, docker.models.containers.Container):
            container = container_hash
        elif isinstance(container_hash, str):
            container = client.containers.get(container_hash)
        else:
            # BUG FIX: was `raise AssertionError("")` — wrong exception type
            # and an empty message for an invalid-argument error.
            raise TypeError(
                f"container_hash must be a str or Container, got {type(container_hash).__name__}"
            )
        print(f"GPU hogging container '{container.attrs['Name'].lstrip('/')}' with pids: {', '.join(map(str, pids))}")
        print("Has following mounts:")
        mounts_str = ''
        for _mount in container.attrs['Mounts']:
            _source_dir = _mount['Source']
            _dist_dir = _mount['Destination']
            user = 'unknown'
            # Attribute the mount to whichever user's home dir contains it.
            for _home_dir in self.users_df.pw_dir:
                if _source_dir.startswith(_home_dir):
                    user = _home_dir.split('/')[-1]
            mounts_str += f"\n\t'{_source_dir}': '{_dist_dir}', from homedir of user {user}"
        print(mounts_str)
        # Fall back to a short image id when the image carries no repo tags.
        image_name = container.image.attrs['Id'][7:19]
        image_author = container.image.attrs['Author']
        if container.image.attrs['RepoTags']:
            image_name = container.image.attrs['RepoTags'][0]
        print(f"Docker image name is: {image_name}, and author is: {image_author}")
class NVML:
    """Context-manager wrapper around pynvml (NVIDIA Management Library)."""

    def __init__(self):
        # Import lazily so this module loads on machines without pynvml.
        self.pynvml = import_module('pynvml')

    def __enter__(self):
        self.pynvml.nvmlInit()
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.pynvml.nvmlShutdown()

    def get_usage(self):
        """Print per-device utilisation and per-process GPU memory usage.

        :returns: list of pids of processes currently holding GPU memory.
        """
        nv = self.pynvml
        megabyte = 1024 * 1024
        gpu_pids = []
        print(f"Driver Version: {nv.nvmlSystemGetDriverVersion()}")
        for device_index in range(nv.nvmlDeviceGetCount()):
            handle = nv.nvmlDeviceGetHandleByIndex(device_index)
            rates = nv.nvmlDeviceGetUtilizationRates(handle)
            print(f"Device {device_index}: {nv.nvmlDeviceGetName(handle)}")
            print(f"Device utilisation: {rates.gpu} , memory {rates.memory}")
            for nv_process in nv.nvmlDeviceGetComputeRunningProcesses(handle):
                print(f"\tProcess {nv_process.pid} uses {nv_process.usedGpuMemory // megabyte} MB of GPU memory")
                gpu_pids.append(nv_process.pid)
        return gpu_pids

    def monitor_gpu_utilisation(self, step=1, limit=20):
        """Print one load/memory line per GPU, `limit` times, every `step` seconds."""
        nv = self.pynvml
        device_count = nv.nvmlDeviceGetCount()
        for _ in range(limit):
            readings = []
            for gpu_id in range(device_count):
                handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
                rates = nv.nvmlDeviceGetUtilizationRates(handle)
                readings.append(f"load {rates.gpu:3d} , mem {rates.memory:3d}")
            print(' | '.join(readings))
            sleep(step)
def is_containered_process_chain(proc_chain):
    """Detect whether a process chain is rooted in docker/containerd.

    :param proc_chain: list of psutil.Process objects, root-most first.
    :returns: the container hash (last path component of the shim's
        -workdir argument) when containerized, otherwise False.
    """
    root_cmd = proc_chain[0].cmdline()
    if root_cmd != ['/usr/bin/containerd']:
        return False
    shim_cmd = proc_chain[1].cmdline()
    assert shim_cmd[0] == 'containerd-shim'
    # The shim's -workdir path ends with the container hash.
    workdir = shim_cmd[shim_cmd.index('-workdir') + 1]
    return workdir.split('/')[-1]
def get_proc_chain(pid, verbose=False):
    """Walk the parent chain from `pid` up to (but excluding) pid 1.

    :param pid: starting process id.
    :param verbose: when True, print each process as it is visited.
    :returns: list of psutil.Process objects ordered root-most first
        (ancestor nearest init first, the starting pid last).
    """
    visited = []
    current = pid
    while current != 1:
        proc = psutil.Process(current)
        visited.append(proc)
        if verbose:
            print(f"PID {current}| u {proc.username()}|cmd: {' '.join(proc.cmdline())}")
        current = proc.ppid()
    return visited[::-1]
def main():
    """Report which users and containers currently hold GPU memory."""
    user_registry = UserRegistry()
    docker_client = docker.from_env()
    separator = '-' * 30
    print(f"{separator}Getting GPU memory usage{separator}")
    with NVML() as nvml:
        gpu_pids = nvml.get_usage()
    containers_to_pids_map = defaultdict(list)
    bare_process_chains = []
    # Split GPU-holding pids into containerized vs bare-metal processes.
    for _pid in gpu_pids:
        chain = get_proc_chain(_pid, verbose=False)
        container_hash = is_containered_process_chain(chain)
        if container_hash:
            containers_to_pids_map[container_hash].append(_pid)
        else:
            bare_process_chains.append(chain)
    print(f"{separator}Determining process owner{separator}")
    for _chain in bare_process_chains:
        user_registry.get_process_owner(_chain)
    print('-' * 80)
    print(f"{separator}Determining container owner{separator}")
    for _cont_hash, _pids in containers_to_pids_map.items():
        user_registry.get_container_owner(docker_client, _cont_hash, _pids)
    print('-' * 80)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment