A fun little CLI tool to gather and display GPU utilization for nodes in a cluster.
Inspired by this LinkedIn post.
python gpu_util.py hostname [hostname ...]
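For example, to watch three nodes (the hostnames below are placeholders for whatever your cluster uses):

python gpu_util.py gpu-node-01 gpu-node-02 gpu-node-03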
#!/usr/bin/env python3
import subprocess
import argparse
import sys
import xml.etree.ElementTree as etree
import time
import os

APP_WIDTH = 34
GPUS_PER_ROW = 5


def print_centered(text: str):
    print(f"\033[34m{' ' * ((APP_WIDTH - len(text)) // 2)}{text}\033[0m")


def remote_nvidia_smi(hostname: str) -> str:
    # Run nvidia-smi on the remote host over SSH and return its XML report.
    return subprocess.check_output(["ssh", hostname, "nvidia-smi", "-x", "-q"]).decode('utf-8')


def extract_gpu_utilization(xml_string):
    # Yield one dict per GPU. gpu_util is kept as raw text (e.g. "42 %" or
    # "N/A") so the caller can decide how to handle unparseable readings.
    for gpu in etree.fromstring(xml_string).findall('gpu'):
        yield {
            'name': gpu.find('product_name').text,
            'gpu_util': gpu.find('utilization').find('gpu_util').text,
        }
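# For reference, an abbreviated sketch of the XML report that nvidia-smi -q -x
# produces and extract_gpu_utilization() walks (values are illustrative):
#
#   <nvidia_smi_log>
#     <gpu id="00000000:01:00.0">
#       <product_name>NVIDIA H100 80GB HBM3</product_name>
#       ...
#       <utilization>
#         <gpu_util>42 %</gpu_util>
#         ...
#       </utilization>
#     </gpu>
#   </nvidia_smi_log>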
def print_gpu_utilization(hosts_data):
    all_gpus = []
    total_util = 0
    valid_gpus = 0

    # Clear the screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # Tally total utilization and valid GPUs; unparseable readings become N/A
    for utilization_data in hosts_data:
        for gpu in utilization_data:
            try:
                util = int(gpu['gpu_util'].replace(' %', ''))
                total_util += util
                valid_gpus += 1
                all_gpus.append(f"\033[32m[{util:3d}%]\033[0m")
            except (ValueError, AttributeError):
                # gpu_util was "N/A" or missing entirely
                all_gpus.append("[ N/A]")

    # Display header
    total_gpu_count = len(all_gpus)
    print("-" * APP_WIDTH)
    print_centered("AI Compute Cluster")

    # Get the GPU model name (or a placeholder when models differ)
    gpu_models = set(gpu['name'] for utilization_data in hosts_data
                     for gpu in utilization_data
                     if gpu['name'] and gpu['name'] != 'N/A')
    gpu_model = gpu_models.pop() if len(gpu_models) == 1 else "Mixed GPUs"
    print_centered(f"{total_gpu_count} GPUs • {gpu_model} ⚡️")
    print_centered("GPU Utilization")

    # Display utilization data in grid format (GPUS_PER_ROW GPUs per row)
    print("-" * APP_WIDTH)
    for i in range(0, len(all_gpus), GPUS_PER_ROW):
        row = all_gpus[i:i + GPUS_PER_ROW]
        print(" ".join(row))

    # Calculate and display the average, guarding against zero valid GPUs
    print("-" * APP_WIDTH)
    avg_util = total_util / valid_gpus if valid_gpus else 0.0
    print_centered(f"Average GPU Utilization: \033[32m{avg_util:.1f}%\033[0m")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("hostnames", nargs="+")
    args = parser.parse_args()

    while True:
        # Get nvidia-smi data for all hostnames
        nvidia_smi_data_list = [remote_nvidia_smi(hostname) for hostname in args.hostnames]

        # Extract utilization data for all hosts
        utilization_data = [list(extract_gpu_utilization(smi_data)) for smi_data in nvidia_smi_data_list]

        # Print the utilization data for all hosts
        print_gpu_utilization(utilization_data)

        # Wait for 1 second before the next update
        time.sleep(1)
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(0)
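The display refreshes once per second and looks roughly like this (layout approximate, numbers invented):

----------------------------------
        AI Compute Cluster
     8 GPUs • NVIDIA H100 ⚡️
         GPU Utilization
----------------------------------
[ 97%] [ 88%] [100%] [ 64%] [ 91%]
[ 73%] [ N/A] [ 82%]
----------------------------------
Average GPU Utilization: 85.0%

Each host must be reachable over non-interactive (key-based) SSH with nvidia-smi on its PATH, since the script simply runs ssh <hostname> nvidia-smi -x -q for every node on each refresh.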