Skip to content

Instantly share code, notes, and snippets.

@thesues
Created May 6, 2025 20:39
Show Gist options
  • Save thesues/3c216d5b8174cfccfc2d62d94030a7e2 to your computer and use it in GitHub Desktop.
dcgm.py
#!/usr/bin/env python3
import argparse
import subprocess
import time
import signal
import sys
import math
import matplotlib.pyplot as plt
def _parse_header(line):
    """Return the metric column names found in a ``dcgmi dmon`` header line.

    The line is split on whitespace, leading ``#`` characters are stripped
    from every token, and the ``Entity`` / ``ID`` labels (any case) as well
    as empty tokens are dropped.
    """
    names = (token.lstrip("#") for token in line.split())
    return [
        name for name in names
        if name and name.upper() not in ("ENTITY", "ID")
    ]


def _record_sample(gpu_id, headers, raw_values, series):
    """Store one row of raw metric strings and return its echo line.

    Args:
        gpu_id: GPU id the row belongs to (used only in the echo text).
        headers: Metric names, in column order.
        raw_values: Raw column strings, paired positionally with ``headers``.
        series: Mapping of metric name -> list of floats for this GPU;
            one value per metric is appended in place.

    Returns:
        A comma-separated ``GPU <id> <metric>=<value>`` string for the row.
    """
    echoed = []
    for name, raw in zip(headers, raw_values):
        try:
            value = float(raw)
            shown = f"{value:.3f}"
        except ValueError:
            # Non-numeric cells (e.g. 'N/A') are kept as NaN so every series
            # stays aligned with the timestamp list.
            value = math.nan
            shown = "N/A"
        series.setdefault(name, []).append(value)
        echoed.append(f"GPU {gpu_id} {name.lower()}={shown}")
    return ", ".join(echoed)


def _plot_gpu(gid, timestamps, series):
    """Plot every metric series of one GPU against elapsed time and save a PNG.

    Args:
        gid: GPU id, used in the title and the output file name.
        timestamps: Non-empty list of sample times in seconds.
        series: Mapping of metric name -> list of values, one per timestamp.

    Returns:
        The name of the image file that was written.
    """
    start = timestamps[0]
    elapsed = [stamp - start for stamp in timestamps]
    fig = plt.figure(figsize=(8, 5))
    for name, values in series.items():
        plt.plot(elapsed, values, label=name.lower())
    plt.xlabel("Time (s)")
    # NOTE(review): the DCGM profiling fields requested here are presumably
    # reported as 0..1 ratios rather than percentages - confirm the label.
    plt.ylabel("Percentage (%)")
    plt.title(f"GPU {gid} DCGM Metrics at 1 Hz")
    plt.legend()
    plt.tight_layout()
    fname = f"gpu_{gid}_metrics.png"
    plt.savefig(fname)
    # pyplot keeps every figure alive until it is closed; release it so a
    # long --gpus list does not accumulate open figures.
    plt.close(fig)
    return fname


def main():
    """Stream ``dcgmi dmon`` samples for the selected GPUs and plot them.

    Parses ``--gpus`` from the command line, launches ``dcgmi dmon`` for the
    fixed field list at a 1000 ms interval, echoes each sample for a selected
    GPU while recording it, and, once the stream ends (for example after
    Ctrl+C), writes one ``gpu_<id>_metrics.png`` per GPU that produced data.

    Exits with status 1 when ``dcgmi`` cannot be started or when no sample
    for any selected GPU was collected.
    """
    parser = argparse.ArgumentParser(
        description="Monitor DCGM counters for one or more GPUs at 1 Hz and plot them on exit."
    )
    parser.add_argument(
        "--gpus", type=int, nargs='+', default=[0],
        help="List of GPU IDs to monitor (e.g. --gpus 0 1 2)"
    )
    args = parser.parse_args()

    # Fixed sampling interval: 1000 ms (1 Hz)
    interval_ms = 1000
    # DCGM field IDs:
    field_ids = ["1002", "1003", "1005"]  # sm_active, sm_occupancy, dram_active
    # dcgmi dmon streams all GPUs; filtering by id happens below in Python.
    cmd = [
        "dcgmi", "dmon",
        "-e", ",".join(field_ids),
        "-d", str(interval_ms),
    ]

    # De-duplicate repeated ids (order preserved) so a GPU is plotted once.
    gpu_ids = list(dict.fromkeys(args.gpus))
    data = {gid: {} for gid in gpu_ids}   # gpu_id -> { header: [values] }
    times = {gid: [] for gid in gpu_ids}  # gpu_id -> [timestamps]
    headers = None

    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True, bufsize=1)
    except OSError as exc:
        # Typically dcgmi is not installed or not on PATH.
        print(f"Failed to start {cmd[0]!r}: {exc}", file=sys.stderr)
        sys.exit(1)

    def handle_sigint(signum, frame):
        # Stop the child; the read loop then ends at EOF and plotting runs.
        proc.terminate()

    previous_handler = signal.signal(signal.SIGINT, handle_sigint)
    try:
        for line in proc.stdout:
            line = line.strip()
            if not line:
                continue
            # Header line (starts with '#') names the metric columns.
            if line.startswith("#"):
                parsed = _parse_header(line)
                # The header may be repeated in the stream; announce it only
                # when it actually changes.
                if parsed != headers:
                    headers = parsed
                    print("Monitoring fields:", ", ".join(headers))
                continue
            parts = line.split()
            # Skip anything that cannot be a full data row.
            if headers is None or len(parts) < 2 + len(headers):
                continue
            try:
                gpu_id = int(parts[1])
            except ValueError:
                continue
            if gpu_id not in times:
                continue
            # Monotonic clock: only elapsed time is plotted, and it must not
            # jump if the wall clock is adjusted during a run.
            times[gpu_id].append(time.monotonic())
            raw_values = parts[2:2 + len(headers)]
            print(_record_sample(gpu_id, headers, raw_values, data[gpu_id]))
    finally:
        # Give Ctrl+C its normal meaning back for the plotting phase.
        if previous_handler is not None:
            signal.signal(signal.SIGINT, previous_handler)
        # Always stop and reap the child, even if the loop raised.
        if proc.poll() is None:
            proc.terminate()
        proc.stdout.close()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    # Checked outside the finally block so a real error from the loop above
    # is not replaced by this exit.
    if not any(times.values()):
        print("No data collected. Exiting.")
        sys.exit(1)

    # Plot for each GPU
    for gid in gpu_ids:
        stamps = times[gid]
        if not stamps:
            print(f"No samples for GPU {gid}, skipping plot.")
            continue
        fname = _plot_gpu(gid, stamps, data[gid])
        print(f"Saved plot for GPU {gid} to {fname}")
# Run the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment