Created
February 4, 2025 18:46
-
-
Save Quentin-Anthony/16490ab14f5cdba4c3bdce66d8285beb to your computer and use it in GitHub Desktop.
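Collects system metrics (NVML GPU statistics, NFS and disk I/O statistics, and /proc memory counters) and logs them every 60 seconds.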
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import subprocess | |
import time | |
import logging | |
from datetime import datetime | |
import pynvml | |
import os | |
# Configure logging | |
def setup_logging():
    """Configure logging to a timestamped file and to the console.

    Ensures the ``metrics_logs`` directory exists (relative to the current
    working directory), then calls ``logging.basicConfig`` with a file
    handler for ``metrics_<YYYYmmdd_HHMMSS>.log`` and a stream handler.

    Returns:
        logging.Logger: The logger named after this module.
    """
    log_dir = "metrics_logs"
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # FileExistsError if the directory appeared between the two calls.
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"{log_dir}/metrics_{timestamp}.log"
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)
def _nvml_query(query):
    """Return ``query()``, or the string "Not supported" if it raises NVMLError."""
    try:
        return query()
    except pynvml.NVMLError:
        return "Not supported"


def _nvml_ecc_errors(handle):
    """Return volatile corrected/uncorrected ECC counters for each memory location."""
    locations = (
        ('l1_cache', pynvml.NVML_MEMORY_LOCATION_L1_CACHE),
        ('l2_cache', pynvml.NVML_MEMORY_LOCATION_L2_CACHE),
        ('device_memory', pynvml.NVML_MEMORY_LOCATION_DEVICE_MEMORY),
    )

    def volatile_count(error_type, location):
        return pynvml.nvmlDeviceGetMemoryErrorCounter(
            handle, error_type, pynvml.NVML_VOLATILE_ECC, location
        )

    return {
        name: {
            'volatile': {
                'single': volatile_count(pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED, location),
                'double': volatile_count(pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, location),
            },
        }
        for name, location in locations
    }


def _nvml_device_metrics(index):
    """Collect the metrics dictionary for the GPU at ``index``."""
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    # Query the PCI record once instead of once per field.
    pci = pynvml.nvmlDeviceGetPciInfo(handle)
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

    return {
        # Basic device info
        'device_index': index,
        'device_name': pynvml.nvmlDeviceGetName(handle),
        # Guarded like the other optional fields so that a board without a
        # readable serial does not abort the whole collection.
        'serial': _nvml_query(lambda: pynvml.nvmlDeviceGetSerial(handle)),
        'uuid': pynvml.nvmlDeviceGetUUID(handle),
        'pci_info': {
            'bus_id': pci.busId,
            'domain': pci.domain,
            'device_id': pci.device,
            'bus': pci.bus,
            'pci_bus_id': pci.busIdLegacy,
        },
        # Memory info
        'memory': {
            'total': memory_info.total,
            'free': memory_info.free,
            'used': memory_info.used,
        },
        # Utilization rates
        'utilization': {
            'gpu': utilization.gpu,
            'memory': utilization.memory,
        },
        # Power usage
        'power': _nvml_query(lambda: {
            'usage': pynvml.nvmlDeviceGetPowerUsage(handle),  # in milliwatts
            'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(handle),  # in milliwatts
        }),
        # Temperature
        'temperature': _nvml_query(lambda: {
            'gpu': pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),  # in Celsius
            #'memory': pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_MEMORY),
            'threshold': pynvml.nvmlDeviceGetTemperatureThreshold(
                handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN
            ),
        }),
        # Fan speed
        'fan_speed': _nvml_query(lambda: pynvml.nvmlDeviceGetFanSpeed(handle)),  # percentage
        # Clock speeds
        'clocks': _nvml_query(lambda: {
            'graphics': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),  # in MHz
            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            'max_graphics': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            'max_sm': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'max_memory': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_MEM),
        }),
        # Performance state
        'performance_state': _nvml_query(lambda: pynvml.nvmlDeviceGetPerformanceState(handle)),
        # Compute mode
        'compute_mode': _nvml_query(lambda: pynvml.nvmlDeviceGetComputeMode(handle)),
        # ECC errors (if supported)
        'ecc_errors': _nvml_query(lambda: _nvml_ecc_errors(handle)),
        # Processes
        'processes': _nvml_query(lambda: [
            {'pid': p.pid, 'used_memory': p.usedGpuMemory}
            for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        ]),
        # PCIe throughput
        'pcie_throughput': _nvml_query(lambda: {
            'tx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_TX_BYTES),  # in KB/s
            'rx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_RX_BYTES),  # in KB/s
        }),
        # Encoder/Decoder utilization, queried independently so that a
        # failure of one does not discard the other's reading.
        'encoder_stats': _nvml_query(lambda: pynvml.nvmlDeviceGetEncoderUtilization(handle)),
        'decoder_stats': _nvml_query(lambda: pynvml.nvmlDeviceGetDecoderUtilization(handle)),
    }


def get_nvml_metrics():
    """Collect driver-level and per-GPU metrics through NVML.

    Returns:
        dict | str: On success, a dict with ``'system'`` (driver, NVML and
        CUDA driver versions) and ``'devices'`` (one dict per GPU; optional
        fields that raise ``NVMLError`` hold the string "Not supported").
        On failure, a string starting with "Error getting NVML metrics: ".
    """
    try:
        pynvml.nvmlInit()
    except Exception as e:
        # Nothing was initialised, so there is nothing to shut down.
        return f"Error getting NVML metrics: {str(e)}"

    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # Get driver version
        driver_version = pynvml.nvmlSystemGetDriverVersion()
        devices = [_nvml_device_metrics(i) for i in range(device_count)]
        system_info = {
            'driver_version': driver_version,
            'nvml_version': pynvml.nvmlSystemGetNVMLVersion(),
            'cuda_driver_version': pynvml.nvmlSystemGetCudaDriverVersion(),
        }
        return {
            'system': system_info,
            'devices': devices,
        }
    except Exception as e:
        return f"Error getting NVML metrics: {str(e)}"
    finally:
        # Always pair the successful nvmlInit() with a shutdown, including on
        # the error path, which previously skipped it.
        try:
            pynvml.nvmlShutdown()
        except pynvml.NVMLError as e:
            # A failed shutdown must not discard metrics already collected.
            logging.getLogger(__name__).warning("NVML shutdown failed: %s", e)
def get_node_exporter_metrics():
    """Fetch the metrics page of a local node_exporter using ``curl``.

    Returns:
        str: The response body on success, otherwise a string starting with
        "Error getting Node Exporter metrics: ".
    """
    try:
        # Assuming node_exporter is running on default port 9100
        result = subprocess.run(
            # --fail turns HTTP error statuses into a non-zero exit;
            # --silent/--show-error keep stderr limited to the real error.
            ['curl', '--silent', '--show-error', '--fail',
             'http://localhost:9100/metrics'],
            capture_output=True, text=True,
            timeout=10,  # a hung endpoint must not stall the collection loop
        )
    except Exception as e:
        return f"Error getting Node Exporter metrics: {str(e)}"

    # Previously a failed request returned an empty string; report it instead.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting Node Exporter metrics: {detail}"
    return result.stdout
def get_dcgm_metrics():
    """Fetch the metrics page of a local dcgm-exporter using ``curl``.

    Returns:
        str: The response body on success, otherwise a string starting with
        "Error getting DCGM metrics: ".
    """
    try:
        # Assuming dcgm-exporter is running on default port 9400
        result = subprocess.run(
            # --fail turns HTTP error statuses into a non-zero exit;
            # --silent/--show-error keep stderr limited to the real error.
            ['curl', '--silent', '--show-error', '--fail',
             'http://localhost:9400/metrics'],
            capture_output=True, text=True,
            timeout=10,  # a hung endpoint must not stall the collection loop
        )
    except Exception as e:
        return f"Error getting DCGM metrics: {str(e)}"

    # Previously a failed request returned an empty string; report it instead.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting DCGM metrics: {detail}"
    return result.stdout
def get_nfsstat_metrics():
    """Run ``nfsstat -l`` and return its output.

    Returns:
        str: The command's standard output on success, otherwise a string
        starting with "Error getting NFS stats: ".
    """
    try:
        result = subprocess.run(
            ['nfsstat', '-l'],
            capture_output=True, text=True,
            timeout=30,  # do not let a stuck command block the collection loop
        )
    except Exception as e:
        return f"Error getting NFS stats: {str(e)}"

    # A non-zero exit used to be ignored and its stderr message dropped.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"nfsstat exited with status {result.returncode}"
        return f"Error getting NFS stats: {detail}"
    return result.stdout
def get_iostat_metrics():
    """Run ``iostat -x`` and return its output.

    Returns:
        str: The command's standard output on success, otherwise a string
        starting with "Error getting IO stats: ".
    """
    try:
        result = subprocess.run(
            ['iostat', '-x'],
            capture_output=True, text=True,
            timeout=30,  # do not let a stuck command block the collection loop
        )
    except Exception as e:
        return f"Error getting IO stats: {str(e)}"

    # A non-zero exit used to be ignored and its stderr message dropped.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"iostat exited with status {result.returncode}"
        return f"Error getting IO stats: {detail}"
    return result.stdout
def get_memory_metrics():
    """Read the raw text of /proc/meminfo and /proc/vmstat.

    Returns:
        dict | str: A dict with keys ``'meminfo'`` and ``'vmstat'`` holding
        each file's full contents, or a string starting with
        "Error getting memory metrics: " if either read fails.
    """
    # (result key, file to read): memory info first, then page fault info.
    sources = (
        ('meminfo', '/proc/meminfo'),
        ('vmstat', '/proc/vmstat'),
    )
    try:
        snapshot = {}
        for key, path in sources:
            with open(path, 'r') as proc_file:
                snapshot[key] = proc_file.read()
        return snapshot
    except Exception as exc:
        return f"Error getting memory metrics: {str(exc)}"
def collect_metrics(logger):
    """Take one snapshot from every enabled collector and log it.

    For each section the logger receives three INFO records: a
    ``=== <name> ===`` header, the section's payload, and a newline
    separator.

    Args:
        logger: Logger that receives the records.
    """
    snapshot = {'timestamp': datetime.now().isoformat()}
    snapshot['nvml'] = get_nvml_metrics()
    # Disabled collectors:
    # snapshot['node_exporter'] = get_node_exporter_metrics()
    # snapshot['dcgm'] = get_dcgm_metrics()
    snapshot['nfsstat'] = get_nfsstat_metrics()
    snapshot['iostat'] = get_iostat_metrics()
    snapshot['memory'] = get_memory_metrics()

    # Emit each section as its own group of log records.
    for section, payload in snapshot.items():
        logger.info(f"=== {section} ===")
        logger.info(payload)
        logger.info("\n")
def main(interval=60):
    """Run the collection loop until interrupted.

    Sets up logging, then repeatedly collects and logs metrics, sleeping
    ``interval`` seconds between rounds. Ctrl-C stops the loop with an
    informational message; any other exception is logged and ends the loop.

    Args:
        interval: Collection interval in seconds. Defaults to 60.
    """
    logger = setup_logging()
    logger.info("Starting metrics collection...")
    try:
        while True:
            collect_metrics(logger)
            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("Metrics collection stopped by user")
    except Exception as e:
        # logger.exception logs at ERROR level like before, and also records
        # the traceback so the failure can be diagnosed from the log file.
        logger.exception(f"Error in main loop: {str(e)}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment