Created
September 7, 2019 14:14
-
-
Save koyo922/db8548e15484675965c5aef99056d3ee to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
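#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 expandtab number
"""
Check every 15 minutes; if the machine has been up for a long time while the GPUs are idle, raise an alert.
Alerts are sent through a free ops-alerting platform; see the REST API integration section at https://caweb.aiops.com/#/integrate
Once configured, register the script to run at boot:
chmod 755 alert_gpu_idle.py
sudo vim /etc/rc.local # add the absolute path of this script
"""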
from __future__ import unicode_literals | |
import subprocess | |
import sys | |
import time | |
import json | |
import logging | |
# `requests` is the only third-party dependency. If it is missing, install it
# once and retry the import so the script can start on a fresh machine.
try:
    import requests
except ImportError:
    # Run pip through the interpreter executing this script; a bare `pip` on
    # PATH may belong to a different Python, in which case the retry below
    # would still fail.
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'requests'])
    import requests
ALERT_URL = 'http://api.aiops.com/alert/api/event'
IDLE_USAGE_THRESHOLD = 0.7        # average utilisation, as a fraction in [0, 1]
MIN_UPTIME_SECONDS = 2 * 3600     # do not alert during the first two hours after boot
CHECK_INTERVAL_SECONDS = 15 * 60  # pause between two checks
REQUEST_TIMEOUT_SECONDS = 30      # upper bound for one alert HTTP request


def average_gpu_usage(nv_output, num_gpus):
    """Return the mean GPU utilisation as a fraction (100% on every GPU == 1.0).

    :param nv_output: text (bytes or str) with one utilisation percentage per
        line, as produced by the ``nvidia-smi`` query issued in ``main``.
    :param num_gpus: number of GPUs the sum of percentages is averaged over.
    :raises ValueError: if a non-blank line is not a number.
    """
    if isinstance(nv_output, bytes):
        nv_output = nv_output.decode('ascii', 'replace')
    readings = [float(line) for line in nv_output.splitlines() if line.strip()]
    return sum(readings) / (100.0 * num_gpus)


def read_uptime_seconds(path='/proc/uptime'):
    """Return the first field of ``path`` (seconds since boot) as a float."""
    # The file is re-read on every cycle of an endless loop, so close it
    # deterministically instead of relying on garbage collection.
    with open(path, 'rt') as uptime_file:
        return float(uptime_file.readline().split()[0])


def should_alert(avg_gpu_usage, up_seconds):
    """Tell whether the machine counts as idle: low GPU usage after a long uptime."""
    return avg_gpu_usage < IDLE_USAGE_THRESHOLD and up_seconds > MIN_UPTIME_SECONDS


def send_alert():
    """POST the "GPU idle" event to the alerting service.

    :return: True if the service accepted the event, False otherwise.
        Failures are logged rather than raised so that the watchdog keeps
        running and tries again on the next cycle.
    """
    # The request body has to be a JSON string, hence json.dumps.
    payload = json.dumps({
        "app": "7e79d4b9-43c7-你在睿智云上创建的appKey", "eventId": "0",
        "eventType": "trigger",
        "priority": 1,
        "alarmName": "AWS上的GPU闲着啦",
        "alarmContent": {"k1": "开机超过2小时且每块GPU平均使用率低于70%", "k2": 0.00},
    }, ensure_ascii=True)
    try:
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.post(ALERT_URL, headers={'Content-type': 'application/json'},
                                 data=payload, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        logging.exception('[ALERT] request failed')
        return False
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not response.ok:
        logging.error('[ALERT] rejected by server: HTTP %s', response.status_code)
        return False
    logging.warning('[ALERT] sent')
    return True


def main():
    """Run the watchdog loop forever.

    The optional first command-line argument is the number of GPUs
    (two by default).
    """
    num_gpus = 2 if len(sys.argv) < 2 else int(sys.argv[1])
    while True:
        nv_output = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'])
        avg_gpu_usage = average_gpu_usage(nv_output, num_gpus)
        up_seconds = read_uptime_seconds()
        if should_alert(avg_gpu_usage, up_seconds):
            send_alert()
        else:
            logging.warning('[SAFE] avg_gpu_usage=%.2f up_seconds=%d seconds', avg_gpu_usage, up_seconds)
        time.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment