#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# check_gpu_sensor: Nagios/Icinga plugin to check GPU sensors
#
# Copyright (C) 2019 Andre Panisson (Python2 script),
# Copyright (C) 2011-2013 Thomas-Krenn.AG (Perl script),
# originally released at https://github.com/thomas-krenn/check_gpu_sensor_v1
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
# The following guides provide helpful information if you want to extend this
# script:
# http://nagiosplug.sourceforge.net/developer-guidelines.html (plug-in
# development guidelines)
###############################################################################
import re
import argparse
from argparse import RawTextHelpFormatter

from pynvml import *
DEVICE_LIST = []  # Array of GPUs in current system

# warning and critical default threshold levels
PERF_THRESHOLDS = dict(
    GPUTemperature=['85', '100'],  # Temperature
    usedMemory=['95', '99'],       # Memory utilization
    fanSpeed=['80', '95'],         # Fan speed
    # only single ecc errors are configurable
    # double ecc errors are treated as discrete sensors and issue
    # a critical status
    ECCMemAggSgl=['1', '2'],       # Dev memory ecc errors
    ECCL1AggSgl=['1', '2'],        # L1 cache ecc errors
    ECCL2AggSgl=['1', '2'],        # L2 cache ecc errors
    ECCRegAggSgl=['1', '2'],       # Dev register ecc errors
    ECCTexAggSgl=['1', '2'],       # Dev texture cache ecc errors
    PWRUsage=['150', '200'],       # Power Usage
    PCIeLinkGen=['2'],             # PCIe link generation
    PCIeLinkWidth=['16'],          # PCIe link width
)

PERF_UNITS = dict(
    GPUTemperature='degrees',
    usedMemory='%'
)
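# Illustrative mapping (hypothetical values): the comma-separated lists passed
# via -w/-c line up positionally with the sensor order documented in the option
# help below, e.g. -w '75,d,d,d,d,d,d,d,d' sets
# PERF_THRESHOLDS['GPUTemperature'][0] to 75 and keeps every other warning
# level at its default ('d').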
###############################################
# Plugin specific functions
# They return help messages and version information
###############################################
def get_version():
    return """check_gpu_sensor version 1.0 20190908
Based on Perl version 2.3 20130610
Copyright (C) 2019 Andre Panisson
Copyright (C) 2011-2013 Thomas-Krenn.AG (written by Georg Schönberger)
Your system is using NVIDIA driver version %s with
NVML version %s""" % (get_driver_version(), get_nvml_version())
def get_options():
    # Parse command line options
    parser = argparse.ArgumentParser(
        prog='check_gpu_sensor',
        description="Nagios/Icinga plugin to check GPU sensors via NVML",
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--device', default=-1,
                        type=int, action='store',
                        metavar='<device id>', help="""\
Return information about the GPU with the given device ID. IDs
can be checked with the nvidia-smi tool. Attention: it is not
ensured that device IDs are persistent across reboots. The safest
way is to define a device bus string via '-db', which can be found
with nvidia-smi.""")
    parser.add_argument('-db', '--device-bus', default='',
                        type=str, action='store',
                        metavar='<pci bus string>', help="""\
Check the GPU with the corresponding PCI bus device string. The PCI bus
ID can be found with:
    nvidia-smi -a | grep 'Bus Id'
An example string is '0000:01:00.0', which can be used to call the plugin:
    ./check_gpu_sensor -db '0000:01:00.0'""")
    parser.add_argument('-T', '--sensors', default='',
                        type=str, action='store',
                        metavar='<sensor type>', help="""\
Limit the sensors to query based on NVML sensor types.
Currently only performance data sensors are supported.
Examples of GPU sensor types are 'GPUTemperature',
'usedMemory', 'fanSpeed'.""")
    parser.add_argument('-w', '--warning', default='',
                        type=str, action='store',
                        metavar='<list of warning thresholds>', help="""\
Change the default warning levels (also consider using a config file
instead ('-cf')). The order of the levels is the following:
    -GPUTemperature
    -usedMemory
    -fanSpeed
    -ECCMemAggSgl
    -ECCL1AggSgl
    -ECCL2AggSgl
    -ECCRegAggSgl
    -ECCTexAggSgl
    -PWRUsage
Levels that should stay default get a 'd' assigned.
Example:
    check_gpu_sensor -w '75,d,d,d,d,d,d,d,d'
This changes the warning level for the temperature.""")
    parser.add_argument('-c', '--critical', default='',
                        type=str, action='store',
                        metavar='<list of critical thresholds>', help="""\
Change the default critical levels. The order of the levels
is the same as for the warning levels; moreover, two more items
can be listed:
    -PCIeLinkGen
    -PCIeLinkWidth
Levels that should stay default get a 'd' assigned.
Example:
    check_gpu_sensor -c '100,d,d,d,d,d,d,d,d,3,16'
This changes the critical level for the temperature, the PCIe link
generation to '3' and the link width to '16'.""")
    parser.add_argument('-v', '--verbose', action='count',
                        help="""\
be verbose
  (no -v) .. single line output
  -v      .. single line output with additional details for warnings
  -vv     .. multi line output, also with additional details for warnings
  -vvv    .. normal output, then debugging output, followed by normal
             multi line output""")
    parser.add_argument('-V', help="show version information",
                        action="store_true")
    return parser
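# Example invocation combining the options above (the PCI bus ID and the
# threshold values are illustrative; adapt them to your system):
#   ./check_gpu_sensor -db '0000:01:00.0' -w '75,d,d,d,d,d,d,d,d' -vv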
###############################################
# Helper functions
# They check for errors and print several structs
# They also generate status outputs and verbose information
###############################################

###############################################
# System specific functions
# They are used to collect information about the current system
###############################################
def get_nvml_version():
    # Working since 3.295.41
    version = get_driver_version()
    version, _ = re.findall(r"(\d+)\.(\d+)", version)[0]
    # the major version is extracted as a string, so compare it numerically
    if int(version) >= 295:
        version = nvmlSystemGetNVMLVersion()
        return version
    else:
        return "not yet supported"
def get_driver_version():
    try:
        version = nvmlSystemGetDriverVersion()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return version


def get_device_count():
    try:
        count = nvmlDeviceGetCount()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return count


def get_device_memory(deviceHandle):
    try:
        memory_info = nvmlDeviceGetMemoryInfo(deviceHandle)
        used_memory = 100 * memory_info.used / memory_info.total
        return used_memory
    except NVMLError as error:
        print("Error: Cannot get memory info for device: " + str(error))
        exit(3)


def get_device_status(current_device):
    deviceHandle = current_device['deviceHandle']
    current_device['productName'] = nvmlDeviceGetName(deviceHandle)
    current_device['deviceComputeMode'] = nvmlDeviceGetComputeMode(deviceHandle)
    current_device['fanSpeed'] = nvmlDeviceGetFanSpeed(deviceHandle)
    current_device['GPUTemperature'] = nvmlDeviceGetTemperature(deviceHandle, NVML_TEMPERATURE_GPU)
    current_device['devicePciInfo'] = nvmlDeviceGetPciInfo(deviceHandle)
    current_device['usedMemory'] = get_device_memory(deviceHandle)
    # TODO: implement monitoring of other sensors
    # current_device['utilizationRates'] = get_device_util(deviceHandle)
    # current_device['nvmlClockInfo'] = get_device_clock(deviceHandle)
    # current_device['nvmlDeviceInforom'] = get_device_inforom(deviceHandle)
    # current_device['nvmlDeviceEccInfos'] = get_device_ecc(deviceHandle)
    # current_device['nvmlDevicePowerInfos'] = get_device_power(deviceHandle)
    # current_device['persistenceMode'] = get_persistence_mode(deviceHandle)
    # current_device['inforomValid'] = get_inforom_validation(deviceHandle)
    # current_device['throttleReasons'] = get_throttle_reasons(deviceHandle)
    # current_device['PCIeLink'] = get_pcie_link(deviceHandle)
    return current_device
###############################################
# Overall device functions
# They collect functions for a GPU in the current system
###############################################
def get_all_device_status(device_id, device_bus):
    count = get_device_count()
    if count == 0:
        print("Error: No NVIDIA device found in current system.")
        exit(3)
    if device_bus != '':
        try:
            handle = nvmlDeviceGetHandleByPciBusId(device_bus)
        except NVMLError as error:
            print("Error: Cannot get handle for device bus ID: " + str(error))
            return "NOK"
    else:
        if device_id != -1:
            try:
                handle = nvmlDeviceGetHandleByIndex(device_id)
            except NVMLError as error:
                print("Error: Cannot get handle for device ID: " + str(error))
                return "NOK"
    gpu_h = {}
    if device_id != -1:
        gpu_h['deviceID'] = device_id
    if device_bus != '':
        gpu_h['devicePCIBusID'] = device_bus
    gpu_h['deviceHandle'] = handle
    # fetching gpu status
    gpu_h = get_device_status(gpu_h)
    # DEVICE_LIST.append(gpu_ref)
    return gpu_h
# collects the perf data (only numeric values)
def collect_perf_data(DEVICE_LIST, sensor_list_ref):
    perf_data = []
    for device in DEVICE_LIST:
        # fetch the desired sensors
        if sensor_list_ref:
            sensor_list = sensor_list_ref.split(',')
        else:
            # if no sensor is given via -T, we dump all
            sensor_list = device.keys()
        dev_perf_data = {}
        for k in sensor_list:
            dev_perf_data[k] = device[k]
        perf_data.append(dev_perf_data)
    return perf_data
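# Illustrative shape of the returned structure for a single device queried
# with -T 'GPUTemperature,usedMemory' (values are hypothetical):
#   [{'GPUTemperature': 38, 'usedMemory': 12}]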
# checks if the given performance data is in its ranges
def check_perf_threshold(perf_data, warn_list, crit_list):
    status_level = ["OK"]
    warn_level = []  # warning sensors
    crit_level = []  # crit sensors
    if warn_list:
        for i in range(len(warn_list)):
            # everything, except the values that should stay default, gets new values
            # e.g. -w d,15,60 changes the warning level for sensor 2 and 3 but not for 1
            if warn_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][0] = warn_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][0] = warn_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][0] = warn_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][0] = warn_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][0] = warn_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][0] = warn_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][0] = warn_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][0] = warn_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][0] = warn_list[i]
    if crit_list:
        for i in range(len(crit_list)):
            if crit_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][1] = crit_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][1] = crit_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][1] = crit_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][1] = crit_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][1] = crit_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][1] = crit_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][1] = crit_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][1] = crit_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][1] = crit_list[i]
                # the PCIe thresholds are configurable here, but the sensors
                # are treated as discrete sensors
                elif i == 9:
                    PERF_THRESHOLDS['PCIeLinkGen'][0] = crit_list[i]
                elif i == 10:
                    PERF_THRESHOLDS['PCIeLinkWidth'][0] = crit_list[i]
    # fetch the perfdata of the gpu
    for k in perf_data.keys():
        if k in PERF_THRESHOLDS:
            # warning level (thresholds may be stored as strings, so compare numerically)
            if perf_data[k] >= float(PERF_THRESHOLDS[k][0]):
                if status_level[0] != "CRITICAL":  # never downgrade an earlier CRITICAL
                    status_level[0] = "WARNING"
                warn_level.append(k)
                # critical level
                if perf_data[k] >= float(PERF_THRESHOLDS[k][1]):
                    status_level[0] = "CRITICAL"
                    warn_level.pop()  # as it is critical, remove it from warning
                    crit_level.append(k)
    status_level.append(warn_level)
    status_level.append(crit_level)
    return status_level
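# Illustrative return value for a device that is running hot but otherwise fine
# (hypothetical reading of 90 degrees against the default 85/100 thresholds):
#   check_perf_threshold({'GPUTemperature': 90}, [], [])
#   -> ['WARNING', ['GPUTemperature'], []]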
def get_status_string(level, perf_data, curr_sensors, verbosity):
    status_string = ""
    # Collect performance data of warn and crit sensors
    for sensor in curr_sensors:
        status_string += "[" + sensor + ": " + level
        if verbosity and sensor in perf_data:
            status_string += " (" + str(perf_data[sensor])
            if sensor in PERF_UNITS:
                status_string += " " + PERF_UNITS[sensor]
            status_string += ")"
        status_string += "]"
    return status_string


def get_perf_string(curr_sensors):
    status_string = ""
    i = 1
    # Collect performance values followed by thresholds
    for k in curr_sensors.keys():
        status_string += k + "=" + str(curr_sensors[k])
        # print warn and crit thresholds
        if k in PERF_THRESHOLDS:
            status_string += ";" + str(PERF_THRESHOLDS[k][0])
            status_string += ";" + str(PERF_THRESHOLDS[k][1])
        if i != len(curr_sensors):
            status_string += " "
        i += 1
    return status_string
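# Illustrative output of get_perf_string() for a device queried with
# -T 'GPUTemperature,usedMemory' (format is "<sensor>=<value>;<warn>;<crit>",
# space separated; values are hypothetical and dict order may vary under
# Python 2):
#   "GPUTemperature=38;85;100 usedMemory=12;95;99"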
###############################################
# Main function
# Command line processing and device status collection
###############################################
def main():
    # Initialize nvml library
    try:
        nvmlInit()
    except NVMLError as error:
        print("Debug: NVML initialization failed.")
        print("Error: " + str(error))
        exit(3)
    parser = get_options()
    args = parser.parse_args()
    if args.V:
        print(get_version())
        exit(0)
    # the desired gpu device to query
    device_id = args.device
    # device bus information
    device_bus = args.device_bus
    # query a specific sensor
    sensor_list = args.sensors
    # change thresholds for performance data
    warn_threshold = args.warning
    try:
        warn_threshold = [(int(t) if t != 'd' else t)
                          for t in warn_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for warning: %s' % warn_threshold)
        exit(3)
    crit_threshold = args.critical
    try:
        crit_threshold = [(int(t) if t != 'd' else t)
                          for t in crit_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for critical: %s' % crit_threshold)
        exit(3)
    verbosity = args.verbose
    # the device ID is not present
    if device_id == -1 and device_bus == '':
        print("Error: Valid PCI bus string or device ID is required.")
        print(parser.format_usage())
        exit(3)
    # Collect the information about the device in the system
    gpu_ref = get_all_device_status(device_id, device_bus)
    if gpu_ref == "NOK":
        print("Ensure to use a valid device ID or device bus string.")
        exit(3)
    DEVICE_LIST.append(gpu_ref)
    perf_data = collect_perf_data(DEVICE_LIST, sensor_list)
    status_level = check_perf_threshold(perf_data[0],
                                        warn_threshold, crit_threshold)
    # TODO: status_level = check_discrete_sensors(status_level)
    # check return values of threshold and discrete sensor function
    EXIT_CODE = 0  # Ok
    if status_level[0] == "WARNING":
        EXIT_CODE = 1  # Warning
    if status_level[0] == "CRITICAL":
        EXIT_CODE = 2  # Critical
    status_string = "GPU " + status_level[0]
    status_string += " - " + DEVICE_LIST[0]['productName'] + " "
    status_string += get_status_string("Critical", perf_data[0],
                                       status_level[2], verbosity)
    status_string += get_status_string("Warning", perf_data[0],
                                       status_level[1], verbosity)
    status_string += "|"
    status_string += get_perf_string(perf_data[0])
    # TODO: print("\n" + get_verbose_string(verbosity, DEVICE_LIST[0], show_na))
    print(status_string)
    # shutdown nvml library
    try:
        nvmlShutdown()
    except NVMLError as error:
        print("Debug: NVML shutdown failed.")
        print("Error: " + str(error))
        exit(3)
    exit(EXIT_CODE)


if __name__ == '__main__':
    main()
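# Illustrative Nagios command and service definitions for this plugin (paths,
# host names and the PCI bus ID are assumptions; adapt them to your setup):
#
#   define command {
#       command_name    check_gpu_sensor
#       command_line    $USER1$/check_gpu_sensor.py -db '0000:01:00.0'
#   }
#
#   define service {
#       use                  generic-service
#       host_name            gpu-node01
#       service_description  GPU sensors
#       check_command        check_gpu_sensor
#   }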