Created
May 6, 2025 20:39
-
-
Save thesues/3c216d5b8174cfccfc2d62d94030a7e2 to your computer and use it in GitHub Desktop.
dcgm.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse
import math
import signal
import subprocess
import sys
import time

import matplotlib.pyplot as plt


def _parse_header(line):
    """Return the metric column names from a ``dcgmi dmon`` header line.

    The header line starts with ``#``. Leading ``#`` characters are stripped
    from every token, then blank tokens (produced when the ``#`` is detached,
    as in ``# Entity ...``) and the ``Entity``/``ID`` column labels are
    dropped so only metric names remain.
    """
    tokens = (tok.lstrip("#") for tok in line.split())
    return [tok for tok in tokens if tok and tok.upper() not in ("ENTITY", "ID")]


def _parse_sample(line, headers):
    """Parse one data line into ``(gpu_id, values)``.

    Returns ``None`` when the line has too few columns or its second token is
    not an integer. Values that cannot be parsed as floats (e.g. ``N/A``) are
    stored as ``math.nan`` so every series stays aligned with its timestamps.
    """
    parts = line.split()
    if len(parts) < 2 + len(headers):
        return None
    try:
        gpu_id = int(parts[1])
    except ValueError:
        return None
    values = []
    for raw in parts[2:2 + len(headers)]:
        try:
            values.append(float(raw))
        except ValueError:
            values.append(math.nan)
    return gpu_id, values


def _stop_process(proc):
    """Terminate *proc* if still running, close its stdout pipe and reap it."""
    if proc.poll() is None:
        proc.terminate()
    if proc.stdout is not None:
        proc.stdout.close()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # The child ignored SIGTERM; force it so we never leave it behind.
        proc.kill()
        proc.wait()


def _plot_gpu(gid, timestamps, series, headers):
    """Plot every metric of one GPU against elapsed time; return the PNG name."""
    start = timestamps[0]
    elapsed = [t - start for t in timestamps]
    fig = plt.figure(figsize=(8, 5))
    try:
        for name in headers:
            plt.plot(elapsed, series[name], label=name.lower())
        plt.xlabel("Time (s)")
        # DCGM profiling fields 1002/1003/1005 are reported as ratios in
        # [0, 1], not percentages, so label the axis accordingly.
        plt.ylabel("Ratio (0-1)")
        plt.title(f"GPU {gid} DCGM Metrics at 1 Hz")
        plt.legend()
        plt.tight_layout()
        fname = f"gpu_{gid}_metrics.png"
        plt.savefig(fname)
    finally:
        # Release the figure; pyplot otherwise keeps one alive per GPU.
        plt.close(fig)
    return fname


def main(argv=None):
    """Stream DCGM counters via ``dcgmi dmon`` and plot them when it stops.

    Launches ``dcgmi dmon`` at a fixed 1000 ms interval, echoes each sample
    for the selected GPUs, and, once the stream ends (Ctrl+C terminates the
    child), writes ``gpu_<id>_metrics.png`` for every GPU that produced data.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 if ``dcgmi`` cannot be started or if no
            sample was collected for any requested GPU.
    """
    parser = argparse.ArgumentParser(
        description="Monitor DCGM counters for one or more GPUs at 1 Hz and plot them on exit."
    )
    parser.add_argument(
        "--gpus", type=int, nargs='+', default=[0],
        help="List of GPU IDs to monitor (e.g. --gpus 0 1 2)"
    )
    args = parser.parse_args(argv)
    wanted = set(args.gpus)

    # Fixed sampling interval: 1000 ms (1 Hz)
    interval_ms = 1000
    # DCGM field IDs:
    field_ids = ["1002", "1003", "1005"]  # sm_active, sm_occupancy, dram_active

    # Launch dcgmi dmon streaming all GPUs, filter in Python
    cmd = [
        "dcgmi", "dmon",
        "-e", ",".join(field_ids),
        "-d", str(interval_ms),
    ]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1)
    except OSError as err:
        # Typically dcgmi is not installed or not on PATH.
        print(f"Failed to start dcgmi: {err}", file=sys.stderr)
        sys.exit(1)

    headers = None
    data = {}   # gpu_id -> { header: [values] }
    times = {}  # gpu_id -> [timestamps]

    def handle_sigint(signum, frame):
        # Stopping the child closes its stdout, which ends the read loop below.
        proc.terminate()

    previous_handler = signal.signal(signal.SIGINT, handle_sigint)
    try:
        for line in proc.stdout:
            line = line.strip()
            if not line:
                continue
            if line.startswith("#"):
                # Only the first usable header defines the columns; repeated
                # header lines in the stream are ignored.
                if headers is None:
                    parsed = _parse_header(line)
                    if parsed:
                        headers = parsed
                        print("Monitoring fields:", ", ".join(headers))
                continue
            if headers is None:
                continue
            sample = _parse_sample(line, headers)
            if sample is None:
                continue
            gpu_id, values = sample
            if gpu_id not in wanted:
                continue

            if gpu_id not in data:
                data[gpu_id] = {name: [] for name in headers}
                times[gpu_id] = []
            times[gpu_id].append(time.time())

            out_strs = []
            for name, value in zip(headers, values):
                data[gpu_id][name].append(value)
                shown = "N/A" if math.isnan(value) else f"{value:.3f}"
                out_strs.append(f"GPU {gpu_id} {name.lower()}={shown}")
            print(", ".join(out_strs))
    finally:
        try:
            _stop_process(proc)
        finally:
            # Give Ctrl+C its normal meaning back for the plotting phase.
            if previous_handler is not None:
                signal.signal(signal.SIGINT, previous_handler)

    if not times:
        print("No data collected. Exiting.")
        sys.exit(1)

    # Plot for each requested GPU once, in the order given on the command line.
    for gid in dict.fromkeys(args.gpus):
        stamps = times.get(gid)
        if not stamps:
            print(f"No samples for GPU {gid}, skipping plot.")
            continue
        fname = _plot_gpu(gid, stamps, data[gid], headers)
        print(f"Saved plot for GPU {gid} to {fname}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment