Created
August 13, 2020 12:55
-
-
Save corenel/6312c37040eec418c09f41909af33fae to your computer and use it in GitHub Desktop.
GPU-Nodes-Metrics-Nvidia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "annotations": { | |
| "list": [ | |
| { | |
| "builtIn": 1, | |
| "datasource": "-- Grafana --", | |
| "enable": true, | |
| "hide": true, | |
| "iconColor": "rgba(0, 211, 255, 1)", | |
| "limit": 100, | |
| "name": "Annotations & Alerts", | |
| "showIn": 0, | |
| "type": "dashboard" | |
| } | |
| ] | |
| }, | |
| "description": "使用 NVIDIA Data Center GPU Manager (DCGM) 的 dcgm-exporter,通过 Prometheus 绘制的 NVIDIA GPU 基础监控信息。", | |
| "editable": true, | |
| "gnetId": 12639, | |
| "graphTooltip": 0, | |
| "id": 1, | |
| "iteration": 1597321805205, | |
| "links": [], | |
| "panels": [ | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "hertz", | |
| "gauge": { | |
| "maxValue": 100, | |
| "minValue": 0, | |
| "show": false, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 2, | |
| "w": 12, | |
| "x": 0, | |
| "y": 0 | |
| }, | |
| "id": 44, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "GPU SM 时钟", | |
| "targets": [ | |
| { | |
| "expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance=~\"$hostname\"}*1000000)", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "GPU SM 时钟", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "", | |
| "title": "GPU SM Clocks", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "hertz", | |
| "gauge": { | |
| "maxValue": 100, | |
| "minValue": 0, | |
| "show": false, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 2, | |
| "w": 12, | |
| "x": 12, | |
| "y": 0 | |
| }, | |
| "id": 45, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "GPU 内存时钟", | |
| "targets": [ | |
| { | |
| "expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance=~\"$hostname\"}*1000000)", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "GPU 内存时钟", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "", | |
| "title": "GPU Memory Clocks", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 0, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 7, | |
| "w": 8, | |
| "x": 0, | |
| "y": 2 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 57, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "links": [], | |
| "nullPointMode": "connected", | |
| "percentage": false, | |
| "pluginVersion": "7.1.3", | |
| "pointradius": 5, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"}", | |
| "format": "time_series", | |
| "hide": false, | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeFrom": null, | |
| "timeRegions": [], | |
| "timeShift": null, | |
| "title": "GPU 使用率", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "buckets": null, | |
| "mode": "time", | |
| "name": null, | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "percent", | |
| "label": null, | |
| "logBase": 1, | |
| "max": "100", | |
| "min": "0", | |
| "show": true | |
| }, | |
| { | |
| "format": "watt", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false, | |
| "alignLevel": null | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": "Prometheus", | |
| "description": "内存使用", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [], | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "red", | |
| "value": 80 | |
| } | |
| ] | |
| } | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 9, | |
| "w": 8, | |
| "x": 8, | |
| "y": 2 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 60, | |
| "legend": { | |
| "avg": false, | |
| "current": false, | |
| "max": false, | |
| "min": false, | |
| "show": true, | |
| "total": false, | |
| "values": false | |
| }, | |
| "lines": true, | |
| "linewidth": 1, | |
| "nullPointMode": "null", | |
| "percentage": false, | |
| "pluginVersion": "7.1.3", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}", | |
| "interval": "", | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "B" | |
| }, | |
| { | |
| "expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}", | |
| "hide": true, | |
| "interval": "", | |
| "legendFormat": "GPU 总内存 {{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeFrom": null, | |
| "timeRegions": [], | |
| "timeShift": null, | |
| "title": "GPU 内存用量", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "buckets": null, | |
| "mode": "time", | |
| "name": null, | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "decmbytes", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false, | |
| "alignLevel": null | |
| } | |
| }, | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "celsius", | |
| "gauge": { | |
| "maxValue": 90, | |
| "minValue": 0, | |
| "show": true, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 5, | |
| "w": 4, | |
| "x": 16, | |
| "y": 2 | |
| }, | |
| "id": 31, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "", | |
| "targets": [ | |
| { | |
| "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"})", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "83,87", | |
| "title": "GPU 平均温度", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "watt", | |
| "gauge": { | |
| "maxValue": 2400, | |
| "minValue": 0, | |
| "show": true, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 5, | |
| "w": 4, | |
| "x": 20, | |
| "y": 2 | |
| }, | |
| "id": 30, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "", | |
| "targets": [ | |
| { | |
| "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"})", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "1800,2200", | |
| "title": "GPU 总功率", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "description": "", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "percent", | |
| "gauge": { | |
| "maxValue": 100, | |
| "minValue": 0, | |
| "show": true, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 5, | |
| "w": 4, | |
| "x": 16, | |
| "y": 7 | |
| }, | |
| "id": 40, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "", | |
| "targets": [ | |
| { | |
| "expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"})", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "70,90", | |
| "title": "GPU 平均内存利用率", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "cacheTimeout": null, | |
| "colorBackground": false, | |
| "colorValue": false, | |
| "colors": [ | |
| "#299c46", | |
| "rgba(237, 129, 40, 0.89)", | |
| "#d44a3a" | |
| ], | |
| "datasource": "Prometheus", | |
| "description": "", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {} | |
| }, | |
| "overrides": [] | |
| }, | |
| "format": "percent", | |
| "gauge": { | |
| "maxValue": 100, | |
| "minValue": 0, | |
| "show": true, | |
| "thresholdLabels": false, | |
| "thresholdMarkers": true | |
| }, | |
| "gridPos": { | |
| "h": 5, | |
| "w": 4, | |
| "x": 20, | |
| "y": 7 | |
| }, | |
| "id": 58, | |
| "interval": null, | |
| "links": [], | |
| "mappingType": 1, | |
| "mappingTypes": [ | |
| { | |
| "name": "value to text", | |
| "value": 1 | |
| }, | |
| { | |
| "name": "range to text", | |
| "value": 2 | |
| } | |
| ], | |
| "maxDataPoints": 100, | |
| "nullPointMode": "connected", | |
| "nullText": null, | |
| "postfix": "", | |
| "postfixFontSize": "50%", | |
| "prefix": "", | |
| "prefixFontSize": "50%", | |
| "rangeMaps": [ | |
| { | |
| "from": "null", | |
| "text": "N/A", | |
| "to": "null" | |
| } | |
| ], | |
| "sparkline": { | |
| "fillColor": "rgba(31, 118, 189, 0.18)", | |
| "full": false, | |
| "lineColor": "rgb(31, 120, 193)", | |
| "show": false | |
| }, | |
| "tableColumn": "", | |
| "targets": [ | |
| { | |
| "expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"})", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": "80,90", | |
| "title": "GPU 平均利用率", | |
| "type": "singlestat", | |
| "valueFontSize": "80%", | |
| "valueMaps": [ | |
| { | |
| "op": "=", | |
| "text": "N/A", | |
| "value": "null" | |
| } | |
| ], | |
| "valueName": "current" | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 6, | |
| "w": 8, | |
| "x": 0, | |
| "y": 9 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 24, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "links": [], | |
| "nullPointMode": "connected", | |
| "percentage": false, | |
| "pluginVersion": "7.1.3", | |
| "pointradius": 5, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": true, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"}", | |
| "format": "time_series", | |
| "hide": false, | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeFrom": null, | |
| "timeRegions": [], | |
| "timeShift": null, | |
| "title": "GPU 功率", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "buckets": null, | |
| "mode": "time", | |
| "name": null, | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "watt", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| }, | |
| { | |
| "format": "watt", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false, | |
| "alignLevel": null | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": "Prometheus", | |
| "description": "内存利用率", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 0, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 10, | |
| "w": 8, | |
| "x": 8, | |
| "y": 11 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 39, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "links": [], | |
| "nullPointMode": "connected", | |
| "percentage": false, | |
| "pluginVersion": "7.1.3", | |
| "pointradius": 5, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"}", | |
| "format": "time_series", | |
| "hide": false, | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeFrom": null, | |
| "timeRegions": [], | |
| "timeShift": null, | |
| "title": "GPU 内存利用率", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "buckets": null, | |
| "mode": "time", | |
| "name": null, | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "percent", | |
| "label": null, | |
| "logBase": 1, | |
| "max": "100", | |
| "min": "0", | |
| "show": true | |
| }, | |
| { | |
| "format": "watt", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false, | |
| "alignLevel": null | |
| } | |
| }, | |
| { | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [], | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "#6ED0E0", | |
| "value": 25 | |
| }, | |
| { | |
| "color": "#EAB839", | |
| "value": 50 | |
| }, | |
| { | |
| "color": "red", | |
| "value": 75 | |
| } | |
| ] | |
| }, | |
| "unit": "percent" | |
| }, | |
| "overrides": [] | |
| }, | |
| "gridPos": { | |
| "h": 9, | |
| "w": 8, | |
| "x": 16, | |
| "y": 12 | |
| }, | |
| "id": 42, | |
| "links": [], | |
| "options": { | |
| "displayMode": "lcd", | |
| "orientation": "horizontal", | |
| "reduceOptions": { | |
| "calcs": [ | |
| "mean" | |
| ], | |
| "fields": "", | |
| "values": false | |
| }, | |
| "showUnfilled": true | |
| }, | |
| "pluginVersion": "7.1.3", | |
| "targets": [ | |
| { | |
| "expr": "(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}/(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}))*100", | |
| "format": "time_series", | |
| "hide": false, | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "timeFrom": null, | |
| "timeShift": null, | |
| "title": "GPU 内存使用率", | |
| "type": "bargauge" | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": "Prometheus", | |
| "fieldConfig": { | |
| "defaults": { | |
| "custom": {}, | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 0, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 6, | |
| "w": 8, | |
| "x": 0, | |
| "y": 15 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 25, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "links": [], | |
| "nullPointMode": "connected", | |
| "percentage": false, | |
| "pluginVersion": "7.1.3", | |
| "pointradius": 5, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"}", | |
| "format": "time_series", | |
| "hide": false, | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "{{instance}}.{{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeFrom": null, | |
| "timeRegions": [], | |
| "timeShift": null, | |
| "title": "GPU 温度", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "buckets": null, | |
| "mode": "time", | |
| "name": null, | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "celsius", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "label": null, | |
| "logBase": 1, | |
| "max": null, | |
| "min": null, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false, | |
| "alignLevel": null | |
| } | |
| } | |
| ], | |
| "refresh": "5s", | |
| "schemaVersion": 26, | |
| "style": "dark", | |
| "tags": [ | |
| "GPU" | |
| ], | |
| "templating": { | |
| "list": [ | |
| { | |
| "allValue": null, | |
| "current": { | |
| "selected": true, | |
| "tags": [], | |
| "text": "lab-81 + lab-80 + lab-73 + lab-72 + lab-71 + lab-70 + lab-61 + lab-60", | |
| "value": [ | |
| "lab-81", | |
| "lab-80", | |
| "lab-73", | |
| "lab-72", | |
| "lab-71", | |
| "lab-70", | |
| "lab-61", | |
| "lab-60" | |
| ] | |
| }, | |
| "datasource": "Prometheus", | |
| "definition": "label_values(instance)", | |
| "hide": 0, | |
| "includeAll": false, | |
| "label": "host", | |
| "multi": true, | |
| "name": "hostname", | |
| "options": [], | |
| "query": "label_values(instance)", | |
| "refresh": 1, | |
| "regex": "", | |
| "skipUrlSync": false, | |
| "sort": 6, | |
| "tagValuesQuery": "", | |
| "tags": [], | |
| "tagsQuery": "", | |
| "type": "query", | |
| "useTags": false | |
| } | |
| ] | |
| }, | |
| "time": { | |
| "from": "now-30m", | |
| "to": "now" | |
| }, | |
| "timepicker": { | |
| "refresh_intervals": [ | |
| "10s", | |
| "30s", | |
| "1m", | |
| "5m", | |
| "15m", | |
| "30m", | |
| "1h", | |
| "2h", | |
| "1d" | |
| ], | |
| "time_options": [ | |
| "5m", | |
| "15m", | |
| "1h", | |
| "6h", | |
| "12h", | |
| "24h", | |
| "2d", | |
| "7d", | |
| "30d" | |
| ] | |
| }, | |
| "timezone": "browser", | |
| "title": "GPU-Nodes-Metrics-Nvidia", | |
| "uid": "hpcsyl6zhqk", | |
| "version": 6 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment