Quentin-Anthony · February 4, 2025 18:46
diff --git a/metrics_collector.py b/metrics_collector.py
 #!/usr/bin/env python3

 import subprocess
 import time
 import logging
 from datetime import datetime
 import pynvml
 import os

 # Configure logging
 def setup_logging():
    """Set up logging to a timestamped file and to the console.

    Ensures the ``metrics_logs`` directory exists (relative to the current
    working directory) and points a file handler at
    ``metrics_logs/metrics_<YYYYmmdd_HHMMSS>.log``.

    Returns:
        The ``logging.Logger`` named after this module.
    """
    log_dir = "metrics_logs"
    # exist_ok=True removes the check-then-create race: with a separate
    # exists() test, two processes starting together can both see "missing"
    # and the slower makedirs() call then raises FileExistsError.
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"{log_dir}/metrics_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)


 def _nvml_optional(query):
    """Return ``query()``, or ``"Not supported"`` if NVML rejects the query.

    ``query`` is a zero-argument callable. ``pynvml.NVMLError`` is swallowed
    deliberately: one metric a device cannot report must not discard the
    rest of that device's sample.
    """
    try:
        return query()
    except pynvml.NVMLError:
        return "Not supported"


 def _nvml_volatile_ecc(handle, location):
    """Return volatile corrected/uncorrected ECC counters for ``location``."""
    def counter(error_type):
        return pynvml.nvmlDeviceGetMemoryErrorCounter(
            handle, error_type, pynvml.NVML_VOLATILE_ECC, location)

    return {
        'volatile': {
            'single': counter(pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED),
            'double': counter(pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED),
        },
    }


 def _nvml_device_metrics(index):
    """Build the metrics dictionary for the GPU at ``index``.

    Handle lookup, name, UUID, PCI info, memory and utilization are treated
    as required: an NVML error from any of them propagates to the caller.
    Every other section falls back to ``"Not supported"``.
    """
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    # Query the PCI record once and read every field from that one result.
    pci = pynvml.nvmlDeviceGetPciInfo(handle)
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

    return {
        'device_index': index,
        'device_name': pynvml.nvmlDeviceGetName(handle),
        # Optional like the sections below: the serial query can fail with a
        # not-supported NVMLError on some boards (see the NVML reference),
        # and that must not abort the whole sample.
        'serial': _nvml_optional(lambda: pynvml.nvmlDeviceGetSerial(handle)),
        'uuid': pynvml.nvmlDeviceGetUUID(handle),
        'pci_info': {
            'bus_id': pci.busId,
            'domain': pci.domain,
            'device_id': pci.device,
            'bus': pci.bus,
            'pci_bus_id': pci.busIdLegacy,
        },
        'memory': {
            'total': memory.total,
            'free': memory.free,
            'used': memory.used,
        },
        'utilization': {
            'gpu': utilization.gpu,
            'memory': utilization.memory,
        },
        'power': _nvml_optional(lambda: {
            'usage': pynvml.nvmlDeviceGetPowerUsage(handle),  # in milliwatts
            'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(handle),  # in milliwatts
        }),
        # Only the GPU sensor is read; no memory-temperature query is made.
        'temperature': _nvml_optional(lambda: {
            'gpu': pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),  # in Celsius
            'threshold': pynvml.nvmlDeviceGetTemperatureThreshold(handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
        }),
        'fan_speed': _nvml_optional(lambda: pynvml.nvmlDeviceGetFanSpeed(handle)),  # percentage
        'clocks': _nvml_optional(lambda: {
            'graphics': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),  # in MHz
            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            'max_graphics': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            'max_sm': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'max_memory': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_MEM),
        }),
        'performance_state': _nvml_optional(lambda: pynvml.nvmlDeviceGetPerformanceState(handle)),
        'compute_mode': _nvml_optional(lambda: pynvml.nvmlDeviceGetComputeMode(handle)),
        'ecc_errors': _nvml_optional(lambda: {
            'l1_cache': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_L1_CACHE),
            'l2_cache': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_L2_CACHE),
            'device_memory': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_DEVICE_MEMORY),
        }),
        'processes': _nvml_optional(lambda: [
            {'pid': proc.pid, 'used_memory': proc.usedGpuMemory}
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        ]),
        'pcie_throughput': _nvml_optional(lambda: {
            'tx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_TX_BYTES),  # in KB/s
            'rx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_RX_BYTES),  # in KB/s
        }),
        # Queried separately so a decoder failure cannot erase a valid
        # encoder reading (and vice versa).
        'encoder_stats': _nvml_optional(lambda: pynvml.nvmlDeviceGetEncoderUtilization(handle)),
        'decoder_stats': _nvml_optional(lambda: pynvml.nvmlDeviceGetDecoderUtilization(handle)),
    }


 def get_nvml_metrics():
    """Sample system-wide and per-GPU metrics through NVML.

    Returns:
        On success, ``{'system': {...}, 'devices': [...]}`` where ``system``
        holds the driver, NVML and CUDA driver versions and ``devices`` has
        one dictionary per GPU index. On any failure, a string starting with
        ``"Error getting NVML metrics: "``.
    """
    try:
        pynvml.nvmlInit()
        try:
            system_info = {
                'driver_version': pynvml.nvmlSystemGetDriverVersion(),
                'nvml_version': pynvml.nvmlSystemGetNVMLVersion(),
                'cuda_driver_version': pynvml.nvmlSystemGetCudaDriverVersion(),
            }
            devices = [
                _nvml_device_metrics(index)
                for index in range(pynvml.nvmlDeviceGetCount())
            ]
        finally:
            # Pair every successful nvmlInit() with nvmlShutdown(), including
            # when a query above raises; this function runs once per
            # collection pass, so a skipped shutdown would repeat each pass.
            pynvml.nvmlShutdown()
        return {
            'system': system_info,
            'devices': devices,
        }
    except Exception as e:
        return f"Error getting NVML metrics: {str(e)}"


 def get_node_exporter_metrics():
    """Fetch the ``/metrics`` page of a local node_exporter via ``curl``.

    Assumes node_exporter is running on its default port, 9100.

    Returns:
        The response body as text, or a string starting with
        ``"Error getting Node Exporter metrics: "`` if curl could not be run,
        timed out, or exited with a non-zero status.
    """
    try:
        result = subprocess.run(
            # -sS drops the progress meter but keeps real error messages on
            # stderr; --max-time stops a stalled endpoint from hanging us.
            ['curl', '-sS', '--max-time', '10', 'http://localhost:9100/metrics'],
            capture_output=True, text=True,
            timeout=15,  # backstop in case curl itself never returns
        )
    except Exception as e:
        return f"Error getting Node Exporter metrics: {str(e)}"

    if result.returncode != 0:
        # Surface the failure instead of handing back curl's stdout, which
        # would make a failed fetch look like an empty metrics page.
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting Node Exporter metrics: {detail}"
    return result.stdout

 def get_dcgm_metrics():
    """Fetch the ``/metrics`` page of a local dcgm-exporter via ``curl``.

    Assumes dcgm-exporter is running on its default port, 9400.

    Returns:
        The response body as text, or a string starting with
        ``"Error getting DCGM metrics: "`` if curl could not be run, timed
        out, or exited with a non-zero status.
    """
    try:
        result = subprocess.run(
            # -sS drops the progress meter but keeps real error messages on
            # stderr; --max-time stops a stalled endpoint from hanging us.
            ['curl', '-sS', '--max-time', '10', 'http://localhost:9400/metrics'],
            capture_output=True, text=True,
            timeout=15,  # backstop in case curl itself never returns
        )
    except Exception as e:
        return f"Error getting DCGM metrics: {str(e)}"

    if result.returncode != 0:
        # Surface the failure instead of handing back curl's stdout, which
        # would make a failed fetch look like an empty metrics page.
        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
        return f"Error getting DCGM metrics: {detail}"
    return result.stdout

 def get_nfsstat_metrics():
    """Capture the output of ``nfsstat -l``.

    Returns:
        The command's stdout as text, or a string starting with
        ``"Error getting NFS stats: "`` if the command could not be run,
        timed out, or exited with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['nfsstat', '-l'],
            capture_output=True, text=True,
            timeout=30,  # never let one stuck command block the whole pass
        )
    except Exception as e:
        return f"Error getting NFS stats: {str(e)}"

    if result.returncode != 0:
        # Report what the tool said on stderr rather than returning its
        # stdout as though the run had succeeded.
        detail = result.stderr.strip() or f"nfsstat exited with status {result.returncode}"
        return f"Error getting NFS stats: {detail}"
    return result.stdout

 def get_iostat_metrics():
    """Capture the output of ``iostat -x``.

    Returns:
        The command's stdout as text, or a string starting with
        ``"Error getting IO stats: "`` if the command could not be run,
        timed out, or exited with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['iostat', '-x'],
            capture_output=True, text=True,
            timeout=30,  # never let one stuck command block the whole pass
        )
    except Exception as e:
        return f"Error getting IO stats: {str(e)}"

    if result.returncode != 0:
        # Report what the tool said on stderr rather than returning its
        # stdout as though the run had succeeded.
        detail = result.stderr.strip() or f"iostat exited with status {result.returncode}"
        return f"Error getting IO stats: {detail}"
    return result.stdout

 def get_memory_metrics():
    """Read the kernel's memory and VM statistics files verbatim.

    Returns:
        ``{'meminfo': <text of /proc/meminfo>, 'vmstat': <text of
        /proc/vmstat>}``, or a string starting with
        ``"Error getting memory metrics: "`` if either file cannot be read.
    """
    # (result key, file to read), read in this order.
    sources = (
        ('meminfo', '/proc/meminfo'),  # memory info
        ('vmstat', '/proc/vmstat'),    # page fault info
    )
    snapshot = {}
    try:
        for key, path in sources:
            with open(path, 'r') as proc_file:
                snapshot[key] = proc_file.read()
    except Exception as err:
        return f"Error getting memory metrics: {str(err)}"
    return snapshot

 def collect_metrics(logger):
    """Run one collection pass and write each section to ``logger``.

    A timestamp is taken first, then the collectors run in a fixed order.
    Each section is logged as a ``=== name ===`` header, the collected data,
    and a separator line.

    Args:
        logger: Logger that receives the collected sections at INFO level.
    """
    sample = {'timestamp': datetime.now().isoformat()}
    collectors = (
        ('nvml', get_nvml_metrics),
        # The node_exporter and dcgm collectors are currently disabled.
        ('nfsstat', get_nfsstat_metrics),
        ('iostat', get_iostat_metrics),
        ('memory', get_memory_metrics),
    )
    for section, collect in collectors:
        sample[section] = collect()

    # Log each metric section separately
    for section, payload in sample.items():
        header = f"=== {section} ==="
        logger.info(header)
        logger.info(payload)
        logger.info("\n")

 def main(interval=60):
    """Collect metrics in a loop until interrupted or a fatal error occurs.

    Args:
        interval: Seconds to sleep between collection passes. Defaults to 60.
    """
    logger = setup_logging()

    logger.info("Starting metrics collection...")

    try:
        while True:
            collect_metrics(logger)
            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("Metrics collection stopped by user")
    except Exception as e:
        # This error ends the collector, so keep the traceback in the log:
        # the message alone is rarely enough to find the cause.
        logger.exception(f"Error in main loop: {str(e)}")


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import subprocess
	import time
	import logging
	from datetime import datetime
	import pynvml
	import os

	# Configure logging
	def setup_logging():
	    """Set up logging to a timestamped file and to the console.

	    Ensures the ``metrics_logs`` directory exists (relative to the current
	    working directory) and points a file handler at
	    ``metrics_logs/metrics_<YYYYmmdd_HHMMSS>.log``.

	    Returns:
	        The ``logging.Logger`` named after this module.
	    """
	    log_dir = "metrics_logs"
	    # exist_ok=True removes the check-then-create race: with a separate
	    # exists() test, two processes starting together can both see "missing"
	    # and the slower makedirs() call then raises FileExistsError.
	    os.makedirs(log_dir, exist_ok=True)

	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    log_file = f"{log_dir}/metrics_{timestamp}.log"

	    logging.basicConfig(
	        level=logging.INFO,
	        format='%(asctime)s - %(message)s',
	        handlers=[
	            logging.FileHandler(log_file),
	            logging.StreamHandler(),
	        ],
	    )
	    return logging.getLogger(__name__)


	def _nvml_optional(query):
	    """Return ``query()``, or ``"Not supported"`` if NVML rejects the query.

	    ``query`` is a zero-argument callable. ``pynvml.NVMLError`` is swallowed
	    deliberately: one metric a device cannot report must not discard the
	    rest of that device's sample.
	    """
	    try:
	        return query()
	    except pynvml.NVMLError:
	        return "Not supported"


	def _nvml_volatile_ecc(handle, location):
	    """Return volatile corrected/uncorrected ECC counters for ``location``."""
	    def counter(error_type):
	        return pynvml.nvmlDeviceGetMemoryErrorCounter(
	            handle, error_type, pynvml.NVML_VOLATILE_ECC, location)

	    return {
	        'volatile': {
	            'single': counter(pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED),
	            'double': counter(pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED),
	        },
	    }


	def _nvml_device_metrics(index):
	    """Build the metrics dictionary for the GPU at ``index``.

	    Handle lookup, name, UUID, PCI info, memory and utilization are treated
	    as required: an NVML error from any of them propagates to the caller.
	    Every other section falls back to ``"Not supported"``.
	    """
	    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
	    # Query the PCI record once and read every field from that one result.
	    pci = pynvml.nvmlDeviceGetPciInfo(handle)
	    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
	    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)

	    return {
	        'device_index': index,
	        'device_name': pynvml.nvmlDeviceGetName(handle),
	        # Optional like the sections below: the serial query can fail with a
	        # not-supported NVMLError on some boards (see the NVML reference),
	        # and that must not abort the whole sample.
	        'serial': _nvml_optional(lambda: pynvml.nvmlDeviceGetSerial(handle)),
	        'uuid': pynvml.nvmlDeviceGetUUID(handle),
	        'pci_info': {
	            'bus_id': pci.busId,
	            'domain': pci.domain,
	            'device_id': pci.device,
	            'bus': pci.bus,
	            'pci_bus_id': pci.busIdLegacy,
	        },
	        'memory': {
	            'total': memory.total,
	            'free': memory.free,
	            'used': memory.used,
	        },
	        'utilization': {
	            'gpu': utilization.gpu,
	            'memory': utilization.memory,
	        },
	        'power': _nvml_optional(lambda: {
	            'usage': pynvml.nvmlDeviceGetPowerUsage(handle),  # in milliwatts
	            'limit': pynvml.nvmlDeviceGetEnforcedPowerLimit(handle),  # in milliwatts
	        }),
	        # Only the GPU sensor is read; no memory-temperature query is made.
	        'temperature': _nvml_optional(lambda: {
	            'gpu': pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),  # in Celsius
	            'threshold': pynvml.nvmlDeviceGetTemperatureThreshold(handle, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN),
	        }),
	        'fan_speed': _nvml_optional(lambda: pynvml.nvmlDeviceGetFanSpeed(handle)),  # percentage
	        'clocks': _nvml_optional(lambda: {
	            'graphics': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),  # in MHz
	            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
	            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
	            'max_graphics': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
	            'max_sm': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM),
	            'max_memory': pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_MEM),
	        }),
	        'performance_state': _nvml_optional(lambda: pynvml.nvmlDeviceGetPerformanceState(handle)),
	        'compute_mode': _nvml_optional(lambda: pynvml.nvmlDeviceGetComputeMode(handle)),
	        'ecc_errors': _nvml_optional(lambda: {
	            'l1_cache': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_L1_CACHE),
	            'l2_cache': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_L2_CACHE),
	            'device_memory': _nvml_volatile_ecc(handle, pynvml.NVML_MEMORY_LOCATION_DEVICE_MEMORY),
	        }),
	        'processes': _nvml_optional(lambda: [
	            {'pid': proc.pid, 'used_memory': proc.usedGpuMemory}
	            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
	        ]),
	        'pcie_throughput': _nvml_optional(lambda: {
	            'tx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_TX_BYTES),  # in KB/s
	            'rx_bytes': pynvml.nvmlDeviceGetPcieThroughput(handle, pynvml.NVML_PCIE_UTIL_RX_BYTES),  # in KB/s
	        }),
	        # Queried separately so a decoder failure cannot erase a valid
	        # encoder reading (and vice versa).
	        'encoder_stats': _nvml_optional(lambda: pynvml.nvmlDeviceGetEncoderUtilization(handle)),
	        'decoder_stats': _nvml_optional(lambda: pynvml.nvmlDeviceGetDecoderUtilization(handle)),
	    }


	def get_nvml_metrics():
	    """Sample system-wide and per-GPU metrics through NVML.

	    Returns:
	        On success, ``{'system': {...}, 'devices': [...]}`` where ``system``
	        holds the driver, NVML and CUDA driver versions and ``devices`` has
	        one dictionary per GPU index. On any failure, a string starting with
	        ``"Error getting NVML metrics: "``.
	    """
	    try:
	        pynvml.nvmlInit()
	        try:
	            system_info = {
	                'driver_version': pynvml.nvmlSystemGetDriverVersion(),
	                'nvml_version': pynvml.nvmlSystemGetNVMLVersion(),
	                'cuda_driver_version': pynvml.nvmlSystemGetCudaDriverVersion(),
	            }
	            devices = [
	                _nvml_device_metrics(index)
	                for index in range(pynvml.nvmlDeviceGetCount())
	            ]
	        finally:
	            # Pair every successful nvmlInit() with nvmlShutdown(), including
	            # when a query above raises; this function runs once per
	            # collection pass, so a skipped shutdown would repeat each pass.
	            pynvml.nvmlShutdown()
	        return {
	            'system': system_info,
	            'devices': devices,
	        }
	    except Exception as e:
	        return f"Error getting NVML metrics: {str(e)}"


	def get_node_exporter_metrics():
	    """Fetch the ``/metrics`` page of a local node_exporter via ``curl``.

	    Assumes node_exporter is running on its default port, 9100.

	    Returns:
	        The response body as text, or a string starting with
	        ``"Error getting Node Exporter metrics: "`` if curl could not be run,
	        timed out, or exited with a non-zero status.
	    """
	    try:
	        result = subprocess.run(
	            # -sS drops the progress meter but keeps real error messages on
	            # stderr; --max-time stops a stalled endpoint from hanging us.
	            ['curl', '-sS', '--max-time', '10', 'http://localhost:9100/metrics'],
	            capture_output=True, text=True,
	            timeout=15,  # backstop in case curl itself never returns
	        )
	    except Exception as e:
	        return f"Error getting Node Exporter metrics: {str(e)}"

	    if result.returncode != 0:
	        # Surface the failure instead of handing back curl's stdout, which
	        # would make a failed fetch look like an empty metrics page.
	        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
	        return f"Error getting Node Exporter metrics: {detail}"
	    return result.stdout

	def get_dcgm_metrics():
	    """Fetch the ``/metrics`` page of a local dcgm-exporter via ``curl``.

	    Assumes dcgm-exporter is running on its default port, 9400.

	    Returns:
	        The response body as text, or a string starting with
	        ``"Error getting DCGM metrics: "`` if curl could not be run, timed
	        out, or exited with a non-zero status.
	    """
	    try:
	        result = subprocess.run(
	            # -sS drops the progress meter but keeps real error messages on
	            # stderr; --max-time stops a stalled endpoint from hanging us.
	            ['curl', '-sS', '--max-time', '10', 'http://localhost:9400/metrics'],
	            capture_output=True, text=True,
	            timeout=15,  # backstop in case curl itself never returns
	        )
	    except Exception as e:
	        return f"Error getting DCGM metrics: {str(e)}"

	    if result.returncode != 0:
	        # Surface the failure instead of handing back curl's stdout, which
	        # would make a failed fetch look like an empty metrics page.
	        detail = result.stderr.strip() or f"curl exited with status {result.returncode}"
	        return f"Error getting DCGM metrics: {detail}"
	    return result.stdout

	def get_nfsstat_metrics():
	    """Capture the output of ``nfsstat -l``.

	    Returns:
	        The command's stdout as text, or a string starting with
	        ``"Error getting NFS stats: "`` if the command could not be run,
	        timed out, or exited with a non-zero status.
	    """
	    try:
	        result = subprocess.run(
	            ['nfsstat', '-l'],
	            capture_output=True, text=True,
	            timeout=30,  # never let one stuck command block the whole pass
	        )
	    except Exception as e:
	        return f"Error getting NFS stats: {str(e)}"

	    if result.returncode != 0:
	        # Report what the tool said on stderr rather than returning its
	        # stdout as though the run had succeeded.
	        detail = result.stderr.strip() or f"nfsstat exited with status {result.returncode}"
	        return f"Error getting NFS stats: {detail}"
	    return result.stdout

	def get_iostat_metrics():
	    """Capture the output of ``iostat -x``.

	    Returns:
	        The command's stdout as text, or a string starting with
	        ``"Error getting IO stats: "`` if the command could not be run,
	        timed out, or exited with a non-zero status.
	    """
	    try:
	        result = subprocess.run(
	            ['iostat', '-x'],
	            capture_output=True, text=True,
	            timeout=30,  # never let one stuck command block the whole pass
	        )
	    except Exception as e:
	        return f"Error getting IO stats: {str(e)}"

	    if result.returncode != 0:
	        # Report what the tool said on stderr rather than returning its
	        # stdout as though the run had succeeded.
	        detail = result.stderr.strip() or f"iostat exited with status {result.returncode}"
	        return f"Error getting IO stats: {detail}"
	    return result.stdout

	def get_memory_metrics():
	    """Read the kernel's memory and VM statistics files verbatim.

	    Returns:
	        ``{'meminfo': <text of /proc/meminfo>, 'vmstat': <text of
	        /proc/vmstat>}``, or a string starting with
	        ``"Error getting memory metrics: "`` if either file cannot be read.
	    """
	    # (result key, file to read), read in this order.
	    sources = (
	        ('meminfo', '/proc/meminfo'),  # memory info
	        ('vmstat', '/proc/vmstat'),    # page fault info
	    )
	    snapshot = {}
	    try:
	        for key, path in sources:
	            with open(path, 'r') as proc_file:
	                snapshot[key] = proc_file.read()
	    except Exception as err:
	        return f"Error getting memory metrics: {str(err)}"
	    return snapshot

	def collect_metrics(logger):
	    """Run one collection pass and write each section to ``logger``.

	    A timestamp is taken first, then the collectors run in a fixed order.
	    Each section is logged as a ``=== name ===`` header, the collected data,
	    and a separator line.

	    Args:
	        logger: Logger that receives the collected sections at INFO level.
	    """
	    sample = {'timestamp': datetime.now().isoformat()}
	    collectors = (
	        ('nvml', get_nvml_metrics),
	        # The node_exporter and dcgm collectors are currently disabled.
	        ('nfsstat', get_nfsstat_metrics),
	        ('iostat', get_iostat_metrics),
	        ('memory', get_memory_metrics),
	    )
	    for section, collect in collectors:
	        sample[section] = collect()

	    # Log each metric section separately
	    for section, payload in sample.items():
	        header = f"=== {section} ==="
	        logger.info(header)
	        logger.info(payload)
	        logger.info("\n")

	def main(interval=60):
	    """Collect metrics in a loop until interrupted or a fatal error occurs.

	    Args:
	        interval: Seconds to sleep between collection passes. Defaults to 60.
	    """
	    logger = setup_logging()

	    logger.info("Starting metrics collection...")

	    try:
	        while True:
	            collect_metrics(logger)
	            time.sleep(interval)
	    except KeyboardInterrupt:
	        logger.info("Metrics collection stopped by user")
	    except Exception as e:
	        # This error ends the collector, so keep the traceback in the log:
	        # the message alone is rarely enough to find the cause.
	        logger.exception(f"Error in main loop: {str(e)}")


	if __name__ == "__main__":
	    main()