#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# check_gpu_sensor: Nagios/Icinga plugin to check GPU sensors
#
# Copyright (C) 2019 Andre Panisson (Python2 script),
# Copyright (C) 2011-2013 Thomas-Krenn.AG (Perl script),
# originally released at https://github.com/thomas-krenn/check_gpu_sensor_v1
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
# The following guides provide helpful information if you want to extend this
# script:
# http://nagiosplug.sourceforge.net/developer-guidelines.html (plug-in
# development guidelines)
###############################################################################
import re
import argparse
from argparse import RawTextHelpFormatter
from pynvml import *
DEVICE_LIST = [] # Array of GPUs in current system
# warning and critical default threshold levels
PERF_THRESHOLDS = dict(
    GPUTemperature=['85', '100'],  # Temperature
    usedMemory=['95', '99'],       # Memory utilization
    fanSpeed=['80', '95'],         # Fan speed
    # only single ecc errors are configurable;
    # double ecc errors are treated as discrete sensors and issue
    # a critical status
    ECCMemAggSgl=['1', '2'],       # Dev memory ecc errors
    ECCL1AggSgl=['1', '2'],        # L1 cache ecc errors
    ECCL2AggSgl=['1', '2'],        # L2 cache ecc errors
    ECCRegAggSgl=['1', '2'],       # Dev register ecc errors
    ECCTexAggSgl=['1', '2'],       # Dev texture cache ecc errors
    PWRUsage=['150', '200'],       # Power Usage
    PCIeLinkGen=['2'],             # PCIe link generation
    PCIeLinkWidth=['16'],          # PCIe link width
)
PERF_UNITS = dict(
    GPUTemperature='degrees',
    usedMemory='%'
)
###############################################
# Plugin specific functions
# They return help messages and version information
###############################################
def get_version():
return """check_gpu_sensor version 1.0 20190908
Based on Perl version 2.3 20130610
Copyright (C) 2019 Andre Panisson
Copyright (C) 2011-2013 Thomas-Krenn.AG (written by Georg Schönberger)
Your system is using NVIDIA driver version %s with
NVML version %s""" % (get_driver_version(), get_nvml_version())
def get_options():
    # Parse command line options
    parser = argparse.ArgumentParser(
        prog='check_gpu_sensor',
        description="Nagios/Icinga plugin to check GPU sensors via NVML",
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--device', default=-1,
                        type=int, action='store',
                        metavar='<device id>', help="""\
Return information about the GPU with the given device ID. IDs
can be checked with the nvidia-smi tool. Attention: It is not
ensured that device IDs are persistent across reboots. The safest
way is to define a device bus string via '-db' which can be found
with nvidia-smi.""")
    parser.add_argument('-db', '--device-bus', default='',
                        type=str, action='store',
                        metavar='<pci bus string>', help="""\
Check the GPU with the corresponding PCI bus device string. The PCI bus
ID can be found with:
nvidia-smi -a | grep 'Bus Id'
An example string is '0000:01:00.0', which can be used to call the plugin:
./check_gpu_sensor -db '0000:01:00.0'""")
    parser.add_argument('-T', '--sensors', default='',
                        type=str, action='store',
                        metavar='<sensor type>', help="""\
Limit the sensors to query based on NVML sensor types.
Currently only performance data sensors are supported.
Examples of GPU sensor types are 'GPUTemperature',
'usedMemory' and 'fanSpeed'.""")
    parser.add_argument('-w', '--warning', default='',
                        type=str, action='store',
                        metavar='<list of warning thresholds>', help="""\
Change the default warning levels (also consider using a config
file ('-cf') instead). The order of the levels is the following:
-GPUTemperature
-usedMemory
-fanSpeed
-ECCMemAggSgl
-ECCL1AggSgl
-ECCL2AggSgl
-ECCRegAggSgl
-ECCTexAggSgl
-PWRUsage
Levels that should stay at their default get a 'd' assigned.
Example:
check_gpu_sensor -w '75,d,d,d,d,d,d,d,d'
This changes the warning level for the temperature.""")
    parser.add_argument('-c', '--critical', default='',
                        type=str, action='store',
                        metavar='<list of critical thresholds>', help="""\
Change the default critical levels. The order of the levels
is the same as for the warning levels; in addition, two more items
can be listed:
-PCIeLinkGen
-PCIeLinkWidth
Levels that should stay at their default get a 'd' assigned.
Example:
check_gpu_sensor -c '100,d,d,d,d,d,d,d,d,3,16'
This changes the critical level for the temperature, the PCIe link
generation to '3' and the link width to '16'.""")
    parser.add_argument('-v', '--verbose', action='count',
                        help="""\
be verbose
  (no -v) .. single line output
  -v      .. single line output with additional details for warnings
  -vv     .. multi line output, also with additional details for warnings
  -vvv    .. normal output, then debugging output, followed by normal
             multi line output""")
    parser.add_argument('-V', help="show version information",
                        action="store_true")
    return parser
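# Illustrative invocations (the device index and PCI bus ID below are examples
# and depend on the host system):
#   ./check_gpu_sensor -d 0
#   ./check_gpu_sensor -db '0000:01:00.0' -w '75,d,d,d,d,d,d,d,d'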
###############################################
# Helper functions
# They check for errors and print several structs
# They also generate status outputs and verbose information
###############################################
###############################################
# System specific functions
# They are used to collect information about the current system
###############################################
def get_nvml_version():
    # Working since 3.295.41
    version = get_driver_version()
    major, _ = re.findall(r"(\d+)\.(\d+)", version)[0]
    if int(major) >= 295:
        version = nvmlSystemGetNVMLVersion()
        return version
    else:
        return "not yet supported"
def get_driver_version():
    try:
        version = nvmlSystemGetDriverVersion()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return version
def get_device_count():
    try:
        count = nvmlDeviceGetCount()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return count
def get_device_memory(deviceHandle):
    try:
        memory_info = nvmlDeviceGetMemoryInfo(deviceHandle)
        used_memory = 100 * memory_info.used / memory_info.total
        return used_memory
    except NVMLError as error:
        print("Error: Cannot get memory info for device: " + str(error))
        exit(3)
def get_device_status(current_device):
    deviceHandle = current_device['deviceHandle']
    current_device['productName'] = nvmlDeviceGetName(deviceHandle)
    current_device['deviceComputeMode'] = nvmlDeviceGetComputeMode(deviceHandle)
    current_device['fanSpeed'] = nvmlDeviceGetFanSpeed(deviceHandle)
    current_device['GPUTemperature'] = nvmlDeviceGetTemperature(deviceHandle, NVML_TEMPERATURE_GPU)
    current_device['devicePciInfo'] = nvmlDeviceGetPciInfo(deviceHandle)
    current_device['usedMemory'] = get_device_memory(deviceHandle)
    # TODO: implement monitoring of other sensors
    # current_device['utilizationRates'] = get_device_util(deviceHandle)
    # current_device['nvmlClockInfo'] = get_device_clock(deviceHandle)
    # current_device['nvmlDeviceInforom'] = get_device_inforom(deviceHandle)
    # current_device['nvmlDeviceEccInfos'] = get_device_ecc(deviceHandle)
    # current_device['nvmlDevicePowerInfos'] = get_device_power(deviceHandle)
    # current_device['persistenceMode'] = get_persistence_mode(deviceHandle)
    # current_device['inforomValid'] = get_inforom_validation(deviceHandle)
    # current_device['throttleReasons'] = get_throttle_reasons(deviceHandle)
    # current_device['PCIeLink'] = get_pcie_link(deviceHandle)
    return current_device
###############################################
# Overall device functions
# They collect functions for a GPU in the current system
###############################################
def get_all_device_status(device_id, device_bus):
    count = get_device_count()
    if count == 0:
        print("Error: No NVIDIA device found in current system.")
        exit(3)
    if device_bus != '':
        try:
            handle = nvmlDeviceGetHandleByPciBusId(device_bus)
        except NVMLError as error:
            print("Error: Cannot get handle for device bus ID: " + str(error))
            return "NOK"
    else:
        if device_id != -1:
            try:
                handle = nvmlDeviceGetHandleByIndex(device_id)
            except NVMLError as error:
                print("Error: Cannot get handle for device ID: " + str(error))
                return "NOK"
    gpu_h = {}
    if device_id != -1:
        gpu_h['deviceID'] = device_id
    if device_bus != '':
        gpu_h['devicePCIBusID'] = device_bus
    gpu_h['deviceHandle'] = handle
    # fetch the gpu status
    gpu_h = get_device_status(gpu_h)
    # DEVICE_LIST.append(gpu_ref)
    return gpu_h
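# Illustrative shape of the dict returned by get_all_device_status() (sensor
# values are hypothetical; the handle and PCI info are opaque NVML objects):
#   {'deviceID': 0, 'deviceHandle': <handle>, 'productName': 'Tesla K20c',
#    'deviceComputeMode': 0, 'fanSpeed': 30, 'GPUTemperature': 45,
#    'devicePciInfo': <pci info>, 'usedMemory': 12}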
# collects the perf data (only numeric values)
def collect_perf_data(DEVICE_LIST, sensor_list_ref):
    sensor_list = []
    perf_data = []
    for device in DEVICE_LIST:
        # fetch the desired sensors
        if sensor_list_ref:
            sensor_list = sensor_list_ref.split(',')
        else:
            # if no sensor is given via -T, we dump all
            sensor_list = device.keys()
        dev_perf_data = {}
        for k in sensor_list:
            dev_perf_data[k] = device[k]
        perf_data.append(dev_perf_data)
    return perf_data
# checks if the given performance data is within its ranges
def check_perf_threshold(perf_data, warn_list, crit_list):
    # status_level is a list: [overall status, warning sensors, critical sensors]
    status_level = ["OK"]
    warn_level = []  # warning sensors
    crit_level = []  # crit sensors
    if warn_list:
        for i in range(len(warn_list)):
            # every value except those that should stay at their default ('d')
            # gets replaced, e.g. -w d,15,60 changes the warning level for
            # sensors 2 and 3 but not for sensor 1
            if warn_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][0] = warn_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][0] = warn_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][0] = warn_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][0] = warn_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][0] = warn_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][0] = warn_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][0] = warn_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][0] = warn_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][0] = warn_list[i]
    if crit_list:
        for i in range(len(crit_list)):
            if crit_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][1] = crit_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][1] = crit_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][1] = crit_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][1] = crit_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][1] = crit_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][1] = crit_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][1] = crit_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][1] = crit_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][1] = crit_list[i]
                # thresholds can be configured here, but these sensors are
                # treated as discrete sensors
                elif i == 9:
                    PERF_THRESHOLDS['PCIeLinkGen'][0] = crit_list[i]
                elif i == 10:
                    PERF_THRESHOLDS['PCIeLinkWidth'][0] = crit_list[i]
    # check the perfdata of the gpu against the thresholds
    for k in perf_data.keys():
        if k in PERF_THRESHOLDS:
            # warning level
            if float(perf_data[k]) >= float(PERF_THRESHOLDS[k][0]):
                if status_level[0] != "CRITICAL":
                    status_level[0] = "WARNING"
                warn_level.append(k)
                # critical level
                if float(perf_data[k]) >= float(PERF_THRESHOLDS[k][1]):
                    status_level[0] = "CRITICAL"
                    warn_level.pop()  # as it is critical, remove it from warning
                    crit_level.append(k)
    status_level.append(warn_level)
    status_level.append(crit_level)
    return status_level
def get_status_string(level, perf_data, curr_sensors, verbosity):
    status_string = ""
    # Collect performance data of warn and crit sensors
    for sensor in curr_sensors:
        status_string += "[" + sensor + ": " + level
        if verbosity and sensor in perf_data:
            status_string += " (" + str(perf_data[sensor])
            if sensor in PERF_UNITS:
                status_string += " " + PERF_UNITS[sensor]
            status_string += ")"
        status_string += "]"
    return status_string
def get_perf_string(curr_sensors):
    status_string = ""
    i = 1
    # Collect performance values followed by thresholds
    for k in curr_sensors.keys():
        status_string += k + "=" + str(curr_sensors[k])
        # print warn and crit thresholds
        if k in PERF_THRESHOLDS:
            status_string += ";" + str(PERF_THRESHOLDS[k][0])
            status_string += ";" + str(PERF_THRESHOLDS[k][1])
        if i != len(curr_sensors):
            status_string += " "
        i += 1
    return status_string
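# Illustrative perfdata produced by get_perf_string() when the sensors are
# limited via -T 'GPUTemperature,usedMemory,fanSpeed' (values are hypothetical;
# the 'value;warn;crit' layout follows the Nagios perfdata convention):
#   GPUTemperature=45;85;100 usedMemory=12;95;99 fanSpeed=30;80;95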
###############################################
# Main function
# Command line processing and device status collection
###############################################
def main():
    # Initialize the NVML library
    try:
        nvmlInit()
    except NVMLError as error:
        print("Debug: NVML initialization failed.")
        print("Error: " + str(error))
        exit(3)
    parser = get_options()
    args = parser.parse_args()
    if args.V:
        print(get_version())
        exit(0)
    # the desired gpu device to query
    device_id = args.device
    # device bus information
    device_bus = args.device_bus
    # query a specific sensor
    sensor_list = args.sensors
    # change thresholds for performance data
    warn_threshold = args.warning
    try:
        warn_threshold = [(int(t) if t != 'd' else t)
                          for t in warn_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for warning: %s' % warn_threshold)
        exit(3)
    crit_threshold = args.critical
    try:
        crit_threshold = [(int(t) if t != 'd' else t)
                          for t in crit_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for critical: %s' % crit_threshold)
        exit(3)
    verbosity = args.verbose
    # neither a device ID nor a device bus string was given
    if device_id == -1 and device_bus == '':
        print("Error: Valid PCI bus string or device ID is required.")
        parser.print_usage()
        exit(3)
    # Collect the information about the device in the system
    gpu_ref = get_all_device_status(device_id, device_bus)
    if gpu_ref == "NOK":
        print("Ensure to use a valid device ID or device bus string.")
        exit(3)
    DEVICE_LIST.append(gpu_ref)
    perf_data = collect_perf_data(DEVICE_LIST, sensor_list)
    status_level = check_perf_threshold(perf_data[0],
                                        warn_threshold, crit_threshold)
    # TODO: status_level = check_discrete_sensors(status_level)
    # check return values of threshold and discrete sensor function
    EXIT_CODE = 0  # OK
    if status_level[0] == "WARNING":
        EXIT_CODE = 1  # Warning
    if status_level[0] == "CRITICAL":
        EXIT_CODE = 2  # Critical
    status_string = "GPU " + status_level[0]
    status_string += " - " + DEVICE_LIST[0]['productName'] + " "
    status_string += get_status_string("Critical", perf_data[0],
                                       status_level[2], verbosity)
    status_string += get_status_string("Warning", perf_data[0],
                                       status_level[1], verbosity)
    status_string += "|"
    status_string += get_perf_string(perf_data[0])
    # TODO: print("\n" + get_verbose_string(verbosity, DEVICE_LIST[0], show_na))
    print(status_string)
    # shut down the NVML library
    try:
        nvmlShutdown()
    except NVMLError as error:
        print("Debug: NVML shutdown failed.")
        print("Error: " + str(error))
        exit(3)
    exit(EXIT_CODE)
if __name__ == '__main__':
    main()
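# Illustrative output for a healthy GPU (product name and values are
# hypothetical):
#   GPU OK - Tesla K20c |GPUTemperature=45;85;100 usedMemory=12;95;99 ...
# Exit codes follow the Nagios convention: 0 = OK, 1 = WARNING, 2 = CRITICAL,
# 3 = UNKNOWN.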