#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# check_gpu_sensor: Nagios/Icinga plugin to check GPU sensors
#
# Copyright (C) 2019 Andre Panisson (Python2 script),
# Copyright (C) 2011-2013 Thomas-Krenn.AG (Perl script),
# originally released at https://github.com/thomas-krenn/check_gpu_sensor_v1
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
# The following guides provide helpful information if you want to extend this
# script:
# http://nagiosplug.sourceforge.net/developer-guidelines.html (plug-in
# development guidelines)
###############################################################################
import re
import argparse
from argparse import RawTextHelpFormatter
from pynvml import *
DEVICE_LIST = [] # Array of GPUs in current system
# warning and critical default threshold levels
PERF_THRESHOLDS = dict(
    GPUTemperature=['85', '100'],  # Temperature
    usedMemory=['95', '99'],       # Memory utilization
    fanSpeed=['80', '95'],         # Fan speed
    # only single ecc errors are configurable;
    # double ecc errors are treated as discrete sensors and issue
    # a critical status
    ECCMemAggSgl=['1', '2'],       # Dev memory ecc errors
    ECCL1AggSgl=['1', '2'],        # L1 cache ecc errors
    ECCL2AggSgl=['1', '2'],        # L2 cache ecc errors
    ECCRegAggSgl=['1', '2'],       # Dev register ecc errors
    ECCTexAggSgl=['1', '2'],       # Dev texture cache ecc errors
    PWRUsage=['150', '200'],       # Power Usage
    PCIeLinkGen=['2'],             # PCIe link generation
    PCIeLinkWidth=['16'],          # PCIe link width
)
PERF_UNITS = dict(
    GPUTemperature='degrees',
    usedMemory='%'
)
###############################################
# Plugin specific functions
# They return help messages and version information
###############################################
def get_version():
return """check_gpu_sensor version 1.0 20190908
Based on Perl version 2.3 20130610
Copyright (C) 2019 Andre Panisson
Copyright (C) 2011-2013 Thomas-Krenn.AG (written by Georg Schönberger)
Your system is using NVIDIA driver version %s with
NVML version %s""" % (get_driver_version(), get_nvml_version())
def get_options():
    # Parse command line options
    parser = argparse.ArgumentParser(
        prog='check_gpu_sensor',
        description="Nagios/Icinga plugin to check GPU sensors via NVML",
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--device', default=-1,
                        type=int, action='store',
                        metavar='<device id>', help="""\
Return information about the GPU with the given device ID. IDs
can be checked with the nvidia-smi tool. Attention: It is not
ensured that device IDs are persistent across reboots. The safest
way is to define a device bus string via '-db' which can be found
with nvidia-smi.""")
    parser.add_argument('-db', '--device-bus', default='',
                        type=str, action='store',
                        metavar='<pci bus string>', help="""\
Check the GPU with the corresponding PCI bus device string. The PCI bus
ID can be found with:
nvidia-smi -a | grep 'Bus Id'
An example string is '0000:01:00.0', which can be used to call the plugin:
./check_gpu_sensor -db '0000:01:00.0'""")
    parser.add_argument('-T', '--sensors', default='',
                        type=str, action='store',
                        metavar='<sensor type>', help="""\
Limit the sensors to query based on NVML sensor types.
Currently only performance data sensors are supported.
Examples of GPU sensor types are 'GPUTemperature',
'usedMemory' and 'fanSpeed'.""")
    parser.add_argument('-w', '--warning', default='',
                        type=str, action='store',
                        metavar='<list of warning thresholds>', help="""\
Change the default warning levels (also consider using a config
file ('-cf') instead). The order of the levels is the following:
-GPUTemperature
-usedMemory
-fanSpeed
-ECCMemAggSgl
-ECCL1AggSgl
-ECCL2AggSgl
-ECCRegAggSgl
-ECCTexAggSgl
-PWRUsage
Levels that should stay at their default get a 'd' assigned.
Example:
check_gpu_sensor -w '75,d,d,d,d,d,d,d,d'
This changes the warning level for the temperature.""")
    parser.add_argument('-c', '--critical', default='',
                        type=str, action='store',
                        metavar='<list of critical thresholds>', help="""\
Change the default critical levels. The order of the levels
is the same as for the warning levels; in addition, two more items
can be listed:
-PCIeLinkGen
-PCIeLinkWidth
Levels that should stay at their default get a 'd' assigned.
Example:
check_gpu_sensor -c '100,d,d,d,d,d,d,d,d,3,16'
This changes the critical level for the temperature, the PCIe link
generation to '3' and the link width to '16'.""")
    parser.add_argument('-v', '--verbose', action='count',
                        help="""\
be verbose
  (no -v) .. single line output
  -v      .. single line output with additional details for warnings
  -vv     .. multi line output, also with additional details for warnings
  -vvv    .. normal output, then debugging output, followed by normal
             multi line output""")
    parser.add_argument('-V', help="show version information",
                        action="store_true")
    return parser
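# Illustrative invocations (the device index and PCI bus ID below are examples
# and depend on the host system):
#   ./check_gpu_sensor -d 0
#   ./check_gpu_sensor -db '0000:01:00.0' -w '75,d,d,d,d,d,d,d,d'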
###############################################
# Helper functions
# They check for errors and print several structs
# They also generate status outputs and verbose information
###############################################
###############################################
# System specific functions
# They are used to collect information about the current system
###############################################
def get_nvml_version():
    # Working since 3.295.41
    version = get_driver_version()
    major, _ = re.findall(r"(\d+)\.(\d+)", version)[0]
    if int(major) >= 295:
        version = nvmlSystemGetNVMLVersion()
        return version
    else:
        return "not yet supported"
def get_driver_version():
    try:
        version = nvmlSystemGetDriverVersion()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return version
def get_device_count():
    try:
        count = nvmlDeviceGetCount()
    except NVMLError as error:
        print("Error: " + str(error) + ".\n")
        exit(3)
    return count
def get_device_memory(deviceHandle):
    try:
        memory_info = nvmlDeviceGetMemoryInfo(deviceHandle)
        used_memory = 100 * memory_info.used / memory_info.total
        return used_memory
    except NVMLError as error:
        print("Error: Cannot get memory info for device: " + str(error))
        exit(3)
def get_device_status(current_device):
    deviceHandle = current_device['deviceHandle']
    current_device['productName'] = nvmlDeviceGetName(deviceHandle)
    current_device['deviceComputeMode'] = nvmlDeviceGetComputeMode(deviceHandle)
    current_device['fanSpeed'] = nvmlDeviceGetFanSpeed(deviceHandle)
    current_device['GPUTemperature'] = nvmlDeviceGetTemperature(deviceHandle, NVML_TEMPERATURE_GPU)
    current_device['devicePciInfo'] = nvmlDeviceGetPciInfo(deviceHandle)
    current_device['usedMemory'] = get_device_memory(deviceHandle)
    # TODO: implement monitoring of other sensors
    # current_device['utilizationRates'] = get_device_util(deviceHandle)
    # current_device['nvmlClockInfo'] = get_device_clock(deviceHandle)
    # current_device['nvmlDeviceInforom'] = get_device_inforom(deviceHandle)
    # current_device['nvmlDeviceEccInfos'] = get_device_ecc(deviceHandle)
    # current_device['nvmlDevicePowerInfos'] = get_device_power(deviceHandle)
    # current_device['persistenceMode'] = get_persistence_mode(deviceHandle)
    # current_device['inforomValid'] = get_inforom_validation(deviceHandle)
    # current_device['throttleReasons'] = get_throttle_reasons(deviceHandle)
    # current_device['PCIeLink'] = get_pcie_link(deviceHandle)
    return current_device
###############################################
# Overall device functions
# They collect functions for a GPU in the current system
###############################################
def get_all_device_status(device_id, device_bus):
    count = get_device_count()
    if count == 0:
        print("Error: No NVIDIA device found in current system.")
        exit(3)
    if device_bus != '':
        try:
            handle = nvmlDeviceGetHandleByPciBusId(device_bus)
        except NVMLError as error:
            print("Error: Cannot get handle for device bus ID: " + str(error))
            return "NOK"
    else:
        if device_id != -1:
            try:
                handle = nvmlDeviceGetHandleByIndex(device_id)
            except NVMLError as error:
                print("Error: Cannot get handle for device ID: " + str(error))
                return "NOK"
    gpu_h = {}
    if device_id != -1:
        gpu_h['deviceID'] = device_id
    if device_bus != '':
        gpu_h['devicePCIBusID'] = device_bus
    gpu_h['deviceHandle'] = handle
    # fetch the gpu status
    gpu_h = get_device_status(gpu_h)
    # DEVICE_LIST.append(gpu_ref)
    return gpu_h
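# Illustrative shape of the dict returned by get_all_device_status() (sensor
# values are hypothetical; the handle and PCI info are opaque NVML objects):
#   {'deviceID': 0, 'deviceHandle': <handle>, 'productName': 'Tesla K20c',
#    'deviceComputeMode': 0, 'fanSpeed': 30, 'GPUTemperature': 45,
#    'devicePciInfo': <pci info>, 'usedMemory': 12}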
# collects the perf data (only numeric values)
def collect_perf_data(DEVICE_LIST, sensor_list_ref):
    sensor_list = []
    perf_data = []
    for device in DEVICE_LIST:
        # fetch the desired sensors
        if sensor_list_ref:
            sensor_list = sensor_list_ref.split(',')
        else:
            # if no sensor is given via -T, we dump all
            sensor_list = device.keys()
        dev_perf_data = {}
        for k in sensor_list:
            dev_perf_data[k] = device[k]
        perf_data.append(dev_perf_data)
    return perf_data
# checks if the given performance data is within its ranges
def check_perf_threshold(perf_data, warn_list, crit_list):
    # status_level is a list: [overall status, warning sensors, critical sensors]
    status_level = ["OK"]
    warn_level = []  # warning sensors
    crit_level = []  # crit sensors
    if warn_list:
        for i in range(len(warn_list)):
            # every value except those that should stay at their default ('d')
            # gets replaced, e.g. -w d,15,60 changes the warning level for
            # sensors 2 and 3 but not for sensor 1
            if warn_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][0] = warn_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][0] = warn_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][0] = warn_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][0] = warn_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][0] = warn_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][0] = warn_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][0] = warn_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][0] = warn_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][0] = warn_list[i]
    if crit_list:
        for i in range(len(crit_list)):
            if crit_list[i] != 'd':
                if i == 0:
                    PERF_THRESHOLDS['GPUTemperature'][1] = crit_list[i]
                elif i == 1:
                    PERF_THRESHOLDS['usedMemory'][1] = crit_list[i]
                elif i == 2:
                    PERF_THRESHOLDS['fanSpeed'][1] = crit_list[i]
                elif i == 3:
                    PERF_THRESHOLDS['ECCMemAggSgl'][1] = crit_list[i]
                elif i == 4:
                    PERF_THRESHOLDS['ECCL1AggSgl'][1] = crit_list[i]
                elif i == 5:
                    PERF_THRESHOLDS['ECCL2AggSgl'][1] = crit_list[i]
                elif i == 6:
                    PERF_THRESHOLDS['ECCRegAggSgl'][1] = crit_list[i]
                elif i == 7:
                    PERF_THRESHOLDS['ECCTexAggSgl'][1] = crit_list[i]
                elif i == 8:
                    PERF_THRESHOLDS['PWRUsage'][1] = crit_list[i]
                # thresholds can be configured here, but these sensors are
                # treated as discrete sensors
                elif i == 9:
                    PERF_THRESHOLDS['PCIeLinkGen'][0] = crit_list[i]
                elif i == 10:
                    PERF_THRESHOLDS['PCIeLinkWidth'][0] = crit_list[i]
    # check the perfdata of the gpu against the thresholds
    for k in perf_data.keys():
        if k in PERF_THRESHOLDS:
            # warning level
            if float(perf_data[k]) >= float(PERF_THRESHOLDS[k][0]):
                if status_level[0] != "CRITICAL":
                    status_level[0] = "WARNING"
                warn_level.append(k)
                # critical level
                if float(perf_data[k]) >= float(PERF_THRESHOLDS[k][1]):
                    status_level[0] = "CRITICAL"
                    warn_level.pop()  # as it is critical, remove it from warning
                    crit_level.append(k)
    status_level.append(warn_level)
    status_level.append(crit_level)
    return status_level
def get_status_string(level, perf_data, curr_sensors, verbosity):
    status_string = ""
    # Collect performance data of warn and crit sensors
    for sensor in curr_sensors:
        status_string += "[" + sensor + ": " + level
        if verbosity and sensor in perf_data:
            status_string += " (" + str(perf_data[sensor])
            if sensor in PERF_UNITS:
                status_string += " " + PERF_UNITS[sensor]
            status_string += ")"
        status_string += "]"
    return status_string
def get_perf_string(curr_sensors):
    status_string = ""
    i = 1
    # Collect performance values followed by thresholds
    for k in curr_sensors.keys():
        status_string += k + "=" + str(curr_sensors[k])
        # print warn and crit thresholds
        if k in PERF_THRESHOLDS:
            status_string += ";" + str(PERF_THRESHOLDS[k][0])
            status_string += ";" + str(PERF_THRESHOLDS[k][1])
        if i != len(curr_sensors):
            status_string += " "
        i += 1
    return status_string
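# Illustrative perfdata produced by get_perf_string() when the sensors are
# limited via -T 'GPUTemperature,usedMemory,fanSpeed' (values are hypothetical;
# the 'value;warn;crit' layout follows the Nagios perfdata convention):
#   GPUTemperature=45;85;100 usedMemory=12;95;99 fanSpeed=30;80;95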
###############################################
# Main function
# Command line processing and device status collection
###############################################
def main():
    # Initialize the NVML library
    try:
        nvmlInit()
    except NVMLError as error:
        print("Debug: NVML initialization failed.")
        print("Error: " + str(error))
        exit(3)
    parser = get_options()
    args = parser.parse_args()
    if args.V:
        print(get_version())
        exit(0)
    # the desired gpu device to query
    device_id = args.device
    # device bus information
    device_bus = args.device_bus
    # query a specific sensor
    sensor_list = args.sensors
    # change thresholds for performance data
    warn_threshold = args.warning
    try:
        warn_threshold = [(int(t) if t != 'd' else t)
                          for t in warn_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for warning: %s' % warn_threshold)
        exit(3)
    crit_threshold = args.critical
    try:
        crit_threshold = [(int(t) if t != 'd' else t)
                          for t in crit_threshold.split(',')
                          if t != '']
    except ValueError:
        print('Invalid parameter value for critical: %s' % crit_threshold)
        exit(3)
    verbosity = args.verbose
    # neither a device ID nor a device bus string was given
    if device_id == -1 and device_bus == '':
        print("Error: Valid PCI bus string or device ID is required.")
        parser.print_usage()
        exit(3)
    # Collect the information about the device in the system
    gpu_ref = get_all_device_status(device_id, device_bus)
    if gpu_ref == "NOK":
        print("Ensure to use a valid device ID or device bus string.")
        exit(3)
    DEVICE_LIST.append(gpu_ref)
    perf_data = collect_perf_data(DEVICE_LIST, sensor_list)
    status_level = check_perf_threshold(perf_data[0],
                                        warn_threshold, crit_threshold)
    # TODO: status_level = check_discrete_sensors(status_level)
    # check return values of threshold and discrete sensor function
    EXIT_CODE = 0  # OK
    if status_level[0] == "WARNING":
        EXIT_CODE = 1  # Warning
    if status_level[0] == "CRITICAL":
        EXIT_CODE = 2  # Critical
    status_string = "GPU " + status_level[0]
    status_string += " - " + DEVICE_LIST[0]['productName'] + " "
    status_string += get_status_string("Critical", perf_data[0],
                                       status_level[2], verbosity)
    status_string += get_status_string("Warning", perf_data[0],
                                       status_level[1], verbosity)
    status_string += "|"
    status_string += get_perf_string(perf_data[0])
    # TODO: print("\n" + get_verbose_string(verbosity, DEVICE_LIST[0], show_na))
    print(status_string)
    # shut down the NVML library
    try:
        nvmlShutdown()
    except NVMLError as error:
        print("Debug: NVML shutdown failed.")
        print("Error: " + str(error))
        exit(3)
    exit(EXIT_CODE)
if __name__ == '__main__':
    main()
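# Illustrative output for a healthy GPU (product name and values are
# hypothetical):
#   GPU OK - Tesla K20c |GPUTemperature=45;85;100 usedMemory=12;95;99 ...
# Exit codes follow the Nagios convention: 0 = OK, 1 = WARNING, 2 = CRITICAL,
# 3 = UNKNOWN.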