Skip to content

Instantly share code, notes, and snippets.

@Quentin-Anthony
Created February 4, 2025 18:46
Show Gist options
  • Save Quentin-Anthony/16490ab14f5cdba4c3bdce66d8285beb to your computer and use it in GitHub Desktop.
Save Quentin-Anthony/16490ab14f5cdba4c3bdce66d8285beb to your computer and use it in GitHub Desktop.
Collects system metrics
#!/usr/bin/env python3
import subprocess
import time
import logging
from datetime import datetime
import pynvml
import os
# Configure logging
def setup_logging():
    """Configure logging to a timestamped file and to the console.

    Ensures the ``metrics_logs`` directory exists (relative to the current
    working directory), then calls ``logging.basicConfig`` with a
    ``FileHandler`` for ``metrics_logs/metrics_<YYYYmmdd_HHMMSS>.log`` and a
    ``StreamHandler``, at INFO level. Note that ``basicConfig`` leaves the
    root logger untouched if it already has handlers.

    Returns:
        logging.Logger: the logger named after this module.
    """
    log_dir = "metrics_logs"
    # exist_ok=True closes the check-then-create race: two collectors started
    # at the same moment can no longer fail with FileExistsError.
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"metrics_{timestamp}.log")

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)
def _nvml_or_unsupported(query):
    """Return ``query()``, or the string "Not supported" if it raises NVMLError."""
    try:
        return query()
    except pynvml.NVMLError:
        return "Not supported"


def _nvml_volatile_ecc(handle, location):
    """Return corrected/uncorrected volatile ECC counters for one memory location."""
    return {
        'volatile': {
            'single': pynvml.nvmlDeviceGetMemoryErrorCounter(
                handle,
                pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED,
                pynvml.NVML_VOLATILE_ECC,
                location,
            ),
            'double': pynvml.nvmlDeviceGetMemoryErrorCounter(
                handle,
                pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
                pynvml.NVML_VOLATILE_ECC,
                location,
            ),
        },
    }


def _nvml_device_metrics(index):
    """Collect the metrics dict for the GPU at ``index`` (NVML must be initialised)."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)

    # Query each struct once and read its fields, instead of one NVML call
    # per field.
    pci = pynvml.nvmlDeviceGetPciInfo(handle)
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

    return {
        # Basic device info
        'device_index': index,
        'device_name': pynvml.nvmlDeviceGetName(handle),
        # Guarded like the other optional sections so that a device which
        # cannot report a serial does not abort the whole collection.
        'serial': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetSerial(handle)),
        'uuid': pynvml.nvmlDeviceGetUUID(handle),
        'pci_info': {
            'bus_id': pci.busId,
            'domain': pci.domain,
            'device_id': pci.device,
            'bus': pci.bus,
            'pci_bus_id': pci.busIdLegacy,
        },
        'memory': {
            'total': memory_info.total,
            'free': memory_info.free,
            'used': memory_info.used,
        },
        'utilization': {
            'gpu': utilization.gpu,
            'memory': utilization.memory,
        },
        # Each optional section below is all-or-nothing: if any query in the
        # section raises NVMLError the section becomes "Not supported".
        'power': _nvml_or_unsupported(lambda: {
            'usage': pynvml.nvmlDeviceGetPowerUsage(handle),  # in milliwatts
            'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(handle),  # in milliwatts
        }),
        # The memory-temperature sensor query is deliberately not collected.
        'temperature': _nvml_or_unsupported(lambda: {
            'gpu': pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU),  # in Celsius
            'threshold': pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
        }),
        'fan_speed': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetFanSpeed(handle)),  # percentage
        'clocks': _nvml_or_unsupported(lambda: {
            'graphics': pynvml.nvmlDeviceGetClockInfo(
                handle, pynvml.NVML_CLOCK_GRAPHICS),  # in MHz
            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            'max_graphics': pynvml.nvmlDeviceGetMaxClockInfo(
                handle, pynvml.NVML_CLOCK_GRAPHICS),
            'max_sm': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'max_memory': pynvml.nvmlDeviceGetMaxClockInfo(
                handle, pynvml.NVML_CLOCK_MEM),
        }),
        'performance_state': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetPerformanceState(handle)),
        'compute_mode': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetComputeMode(handle)),
        # ECC errors (if supported)
        'ecc_errors': _nvml_or_unsupported(lambda: {
            'l1_cache': _nvml_volatile_ecc(
                handle, pynvml.NVML_MEMORY_LOCATION_L1_CACHE),
            'l2_cache': _nvml_volatile_ecc(
                handle, pynvml.NVML_MEMORY_LOCATION_L2_CACHE),
            'device_memory': _nvml_volatile_ecc(
                handle, pynvml.NVML_MEMORY_LOCATION_DEVICE_MEMORY),
        }),
        'processes': _nvml_or_unsupported(lambda: [
            {'pid': proc.pid, 'used_memory': proc.usedGpuMemory}
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        ]),
        'pcie_throughput': _nvml_or_unsupported(lambda: {
            'tx_bytes': pynvml.nvmlDeviceGetPcieThroughput(
                handle, pynvml.NVML_PCIE_UTIL_TX_BYTES),  # in KB/s
            'rx_bytes': pynvml.nvmlDeviceGetPcieThroughput(
                handle, pynvml.NVML_PCIE_UTIL_RX_BYTES),  # in KB/s
        }),
        # Encoder and decoder are queried independently so that a failure of
        # one does not discard the other's reading.
        'encoder_stats': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetEncoderUtilization(handle)),
        'decoder_stats': _nvml_or_unsupported(
            lambda: pynvml.nvmlDeviceGetDecoderUtilization(handle)),
    }


def get_nvml_metrics():
    """Collect system-level and per-GPU metrics through NVML.

    Returns:
        On success, a dict ``{'system': {...}, 'devices': [...]}`` where
        ``system`` holds the driver, NVML and CUDA driver versions and
        ``devices`` holds one dict per GPU. Optional per-device sections that
        raise ``NVMLError`` are reported as the string "Not supported".
        On any other failure, the string
        ``"Error getting NVML metrics: <message>"``; this function does not
        raise.
    """
    try:
        pynvml.nvmlInit()
        try:
            driver_version = pynvml.nvmlSystemGetDriverVersion()
            devices = [
                _nvml_device_metrics(index)
                for index in range(pynvml.nvmlDeviceGetCount())
            ]
            system_info = {
                'driver_version': driver_version,
                'nvml_version': pynvml.nvmlSystemGetNVMLVersion(),
                'cuda_driver_version': pynvml.nvmlSystemGetCudaDriverVersion(),
            }
        finally:
            # Always release the NVML session, including when a query above
            # raised; otherwise every failed pass leaves NVML initialised.
            pynvml.nvmlShutdown()
        return {
            'system': system_info,
            'devices': devices,
        }
    except Exception as e:
        return f"Error getting NVML metrics: {str(e)}"
def get_node_exporter_metrics():
    """Fetch the metrics page served by a local node_exporter via ``curl``.

    Assumes node_exporter is running on its default port 9100 on localhost.

    Returns:
        str: the response body on success. If curl cannot be started, times
        out, or exits with a non-zero status (connection refused, HTTP error),
        a string of the form ``"Error getting Node Exporter metrics: <detail>"``.
        This function does not raise.
    """
    url = 'http://localhost:9100/metrics'
    try:
        result = subprocess.run(
            # -sS: no progress meter but keep error text; --fail: HTTP >= 400
            # becomes a non-zero exit; --max-time bounds the transfer.
            ['curl', '-sS', '--fail', '--max-time', '10', url],
            capture_output=True,
            text=True,
            timeout=15,  # backstop so a stuck curl cannot stall the collector
        )
    except Exception as e:
        return f"Error getting Node Exporter metrics: {str(e)}"

    if result.returncode != 0:
        # Report the failure instead of silently returning empty output.
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting Node Exporter metrics: {detail}"
    return result.stdout
def get_dcgm_metrics():
    """Fetch the metrics page served by a local dcgm-exporter via ``curl``.

    Assumes dcgm-exporter is running on its default port 9400 on localhost.

    Returns:
        str: the response body on success. If curl cannot be started, times
        out, or exits with a non-zero status (connection refused, HTTP error),
        a string of the form ``"Error getting DCGM metrics: <detail>"``.
        This function does not raise.
    """
    url = 'http://localhost:9400/metrics'
    try:
        result = subprocess.run(
            # -sS: no progress meter but keep error text; --fail: HTTP >= 400
            # becomes a non-zero exit; --max-time bounds the transfer.
            ['curl', '-sS', '--fail', '--max-time', '10', url],
            capture_output=True,
            text=True,
            timeout=15,  # backstop so a stuck curl cannot stall the collector
        )
    except Exception as e:
        return f"Error getting DCGM metrics: {str(e)}"

    if result.returncode != 0:
        # Report the failure instead of silently returning empty output.
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting DCGM metrics: {detail}"
    return result.stdout
def get_nfsstat_metrics():
    """Return the text printed by ``nfsstat -l``.

    Returns:
        str: the command's standard output on success. If the command cannot
        be started, times out, or exits with a non-zero status, a string of
        the form ``"Error getting NFS stats: <detail>"``. This function does
        not raise.
    """
    try:
        result = subprocess.run(
            ['nfsstat', '-l'],
            capture_output=True,
            text=True,
            timeout=30,  # keep a stuck command from stalling the collector
        )
    except Exception as e:
        return f"Error getting NFS stats: {str(e)}"

    if result.returncode != 0:
        # Surface the failure rather than logging (possibly empty) stdout.
        detail = result.stderr.strip() or f"nfsstat exited with status {result.returncode}"
        return f"Error getting NFS stats: {detail}"
    return result.stdout
def get_iostat_metrics():
    """Return the text printed by ``iostat -x``.

    Returns:
        str: the command's standard output on success. If the command cannot
        be started, times out, or exits with a non-zero status, a string of
        the form ``"Error getting IO stats: <detail>"``. This function does
        not raise.
    """
    try:
        result = subprocess.run(
            ['iostat', '-x'],
            capture_output=True,
            text=True,
            timeout=30,  # keep a stuck command from stalling the collector
        )
    except Exception as e:
        return f"Error getting IO stats: {str(e)}"

    if result.returncode != 0:
        # Surface the failure rather than logging (possibly empty) stdout.
        detail = result.stderr.strip() or f"iostat exited with status {result.returncode}"
        return f"Error getting IO stats: {detail}"
    return result.stdout
def get_memory_metrics():
    """Read the raw contents of ``/proc/meminfo`` and ``/proc/vmstat``.

    Returns:
        A dict with the full text of each file under the keys ``'meminfo'``
        and ``'vmstat'``. If either file cannot be read, the string
        ``"Error getting memory metrics: <message>"`` is returned instead.
    """
    # (result key, procfs path); vmstat is read for its page fault info.
    sources = (
        ('meminfo', '/proc/meminfo'),
        ('vmstat', '/proc/vmstat'),
    )
    snapshot = {}
    try:
        for key, path in sources:
            with open(path, 'r') as proc_file:
                snapshot[key] = proc_file.read()
    except Exception as e:
        return f"Error getting memory metrics: {str(e)}"
    return snapshot
def collect_metrics(logger):
    """Take one snapshot from every enabled collector and log it.

    Each section is written as three INFO records: a ``=== name ===`` header,
    the section's data, and a newline separator.

    Args:
        logger: logger that receives the records.
    """
    snapshot = {'timestamp': datetime.now().isoformat()}
    snapshot['nvml'] = get_nvml_metrics()
    # The node_exporter and dcgm collectors are currently disabled:
    # snapshot['node_exporter'] = get_node_exporter_metrics()
    # snapshot['dcgm'] = get_dcgm_metrics()
    snapshot['nfsstat'] = get_nfsstat_metrics()
    snapshot['iostat'] = get_iostat_metrics()
    snapshot['memory'] = get_memory_metrics()

    # Emit the sections one at a time, in collection order.
    for section in snapshot:
        logger.info(f"=== {section} ===")
        logger.info(snapshot[section])
        logger.info("\n")
def main(interval=60):
    """Run the metrics collection loop until interrupted.

    Sets up logging, then repeatedly collects metrics and sleeps. Ctrl-C
    (``KeyboardInterrupt``) stops the loop with an informational message; any
    other exception is logged with its traceback and ends the loop.

    Args:
        interval: seconds to sleep between collection passes (default 60).
    """
    logger = setup_logging()
    logger.info("Starting metrics collection...")
    try:
        while True:
            collect_metrics(logger)
            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("Metrics collection stopped by user")
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which a plain error() call with only str(e) would lose.
        logger.exception(f"Error in main loop: {str(e)}")
# Start the collection loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment