Created
April 2, 2024 19:11
-
-
Save pandeybk/66801a631e92050333644d441aedad50 to your computer and use it in GitHub Desktop.
NVIDIA DCGM Exporter Dashboard V2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"__requires": [ | |
{ | |
"type": "panel", | |
"id": "gauge", | |
"name": "Gauge", | |
"version": "" | |
}, | |
{ | |
"type": "grafana", | |
"id": "grafana", | |
"name": "Grafana", | |
"version": "6.7.3" | |
}, | |
{ | |
"type": "panel", | |
"id": "graph", | |
"name": "Graph", | |
"version": "" | |
}, | |
{ | |
"type": "datasource", | |
"id": "prometheus", | |
"name": "Prometheus", | |
"version": "1.0.0" | |
} | |
], | |
"annotations": { | |
"list": [ | |
{ | |
"$$hashKey": "object:192", | |
"builtIn": 1, | |
"datasource": "-- Grafana --", | |
"enable": true, | |
"hide": true, | |
"iconColor": "rgba(0, 211, 255, 1)", | |
"name": "Annotations & Alerts", | |
"type": "dashboard" | |
} | |
] | |
}, | |
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster", | |
"editable": true, | |
"gnetId": 12239, | |
"graphTooltip": 0, | |
"id": null, | |
"iteration": 1588401887165, | |
"links": [], | |
"panels": [ | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 18, | |
"x": 0, | |
"y": 0 | |
}, | |
"hiddenSeries": false, | |
"id": 12, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
"instant": false, | |
"interval": "", | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU Temperature", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "celsius", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"title": "Total Number of GPUs", | |
"type": "stat", | |
"datasource": "$datasource", | |
"targets": [ | |
{ | |
"expr": "count(DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"})", | |
"format": "time_series", | |
"refId": "A" | |
} | |
], | |
"gridPos": { | |
"h": 8, | |
"w": 6, | |
"x": 0, | |
"y": 40 | |
}, | |
"id": 20, | |
"options": { | |
"fieldOptions": { | |
"calcs": [ | |
"lastNotNull" | |
], | |
"defaults": { | |
"unit": "none" | |
} | |
} | |
} | |
}, | |
{ | |
"datasource": "$datasource", | |
"gridPos": { | |
"h": 8, | |
"w": 6, | |
"x": 18, | |
"y": 0 | |
}, | |
"id": 14, | |
"options": { | |
"fieldOptions": { | |
"calcs": [ | |
"mean" | |
], | |
"defaults": { | |
"color": { | |
"mode": "thresholds" | |
}, | |
"mappings": [], | |
"max": 100, | |
"min": 0, | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 83 | |
}, | |
{ | |
"color": "red", | |
"value": 87 | |
} | |
] | |
}, | |
"unit": "celsius" | |
}, | |
"overrides": [], | |
"values": false | |
}, | |
"orientation": "auto", | |
"showThresholdLabels": false, | |
"showThresholdMarkers": true | |
}, | |
"pluginVersion": "6.7.3", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})", | |
"interval": "", | |
"legendFormat": "", | |
"refId": "A" | |
} | |
], | |
"timeFrom": null, | |
"timeShift": null, | |
"title": "GPU Avg. Temp", | |
"type": "gauge" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 18, | |
"x": 0, | |
"y": 8 | |
}, | |
"hiddenSeries": false, | |
"id": 10, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pluginVersion": "6.5.2", | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
"interval": "", | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU Power Usage", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "watt", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"cacheTimeout": null, | |
"datasource": "$datasource", | |
"gridPos": { | |
"h": 8, | |
"w": 6, | |
"x": 18, | |
"y": 8 | |
}, | |
"id": 16, | |
"links": [], | |
"options": { | |
"fieldOptions": { | |
"calcs": [ | |
"sum" | |
], | |
"defaults": { | |
"color": { | |
"mode": "thresholds" | |
}, | |
"mappings": [], | |
"max": 2400, | |
"min": 0, | |
"nullValueMode": "connected", | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 1800 | |
}, | |
{ | |
"color": "red", | |
"value": 2200 | |
} | |
] | |
}, | |
"unit": "watt" | |
}, | |
"overrides": [], | |
"values": false | |
}, | |
"orientation": "horizontal", | |
"showThresholdLabels": false, | |
"showThresholdMarkers": true | |
}, | |
"pluginVersion": "6.7.3", | |
"targets": [ | |
{ | |
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})", | |
"instant": true, | |
"interval": "", | |
"legendFormat": "", | |
"range": false, | |
"refId": "A" | |
} | |
], | |
"timeFrom": null, | |
"timeShift": null, | |
"title": "GPU Power Total", | |
"type": "gauge" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 12, | |
"x": 0, | |
"y": 16 | |
}, | |
"hiddenSeries": false, | |
"id": 2, | |
"interval": "", | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"sideWidth": null, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU SM Clocks", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"decimals": null, | |
"format": "hertz", | |
"label": "", | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 12, | |
"x": 0, | |
"y": 24 | |
}, | |
"hiddenSeries": false, | |
"id": 6, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
"interval": "", | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU Utilization", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "cumulative" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "percent", | |
"label": null, | |
"logBase": 1, | |
"max": "100", | |
"min": "0", | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 12, | |
"x": 0, | |
"y": 32 | |
}, | |
"hiddenSeries": false, | |
"id": 18, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
"interval": "", | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU Framebuffer Mem Used", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "mbytes", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "$datasource", | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 8, | |
"w": 12, | |
"x": 12, | |
"y": 24 | |
}, | |
"hiddenSeries": false, | |
"id": 4, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": false, | |
"rightSide": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"nullPointMode": "null", | |
"options": { | |
"dataLinks": [] | |
}, | |
"percentage": false, | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
"interval": "", | |
"legendFormat": "GPU {{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "Tensor Core Utilization", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "cumulative" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "percentunit", | |
"label": null, | |
"logBase": 1, | |
"max": "1", | |
"min": "0", | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
} | |
], | |
"refresh": false, | |
"schemaVersion": 22, | |
"style": "dark", | |
"tags": [], | |
"templating": { | |
"list": [ | |
{ | |
"current": { | |
"selected": true, | |
"text": "Prometheus", | |
"value": "Prometheus" | |
}, | |
"hide": 0, | |
"includeAll": false, | |
"multi": false, | |
"name": "datasource", | |
"options": [], | |
"query": "prometheus", | |
"queryValue": "", | |
"refresh": 1, | |
"regex": "", | |
"skipUrlSync": false, | |
"type": "datasource" | |
}, | |
{ | |
"allValue": null, | |
"current": {}, | |
"datasource": "$datasource", | |
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", | |
"hide": 0, | |
"includeAll": true, | |
"index": -1, | |
"label": null, | |
"multi": true, | |
"name": "instance", | |
"options": [], | |
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", | |
"refresh": 1, | |
"regex": "", | |
"skipUrlSync": false, | |
"sort": 1, | |
"tagValuesQuery": "", | |
"tags": [], | |
"tagsQuery": "", | |
"type": "query", | |
"useTags": false | |
}, | |
{ | |
"allValue": null, | |
"current": {}, | |
"datasource": "$datasource", | |
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", | |
"hide": 0, | |
"includeAll": true, | |
"index": -1, | |
"label": null, | |
"multi": true, | |
"name": "gpu", | |
"options": [], | |
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", | |
"refresh": 1, | |
"regex": "", | |
"skipUrlSync": false, | |
"sort": 1, | |
"tagValuesQuery": "", | |
"tags": [], | |
"tagsQuery": "", | |
"type": "query", | |
"useTags": false | |
} | |
] | |
}, | |
"time": { | |
"from": "now-15m", | |
"to": "now" | |
}, | |
"timepicker": { | |
"refresh_intervals": [ | |
"5s", | |
"10s", | |
"30s", | |
"1m", | |
"5m", | |
"15m", | |
"30m", | |
"1h", | |
"2h", | |
"1d" | |
] | |
}, | |
"timezone": "", | |
"title": "NVIDIA DCGM Exporter Dashboard V2", | |
"uid": "Oxed_c6Wz", | |
"variables": { | |
"list": [] | |
}, | |
"version": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment