Created
June 11, 2019 23:51
-
-
Save zhuchen115/0a6afab1d2d117a9b7fa7d681a9081a6 to your computer and use it in GitHub Desktop.
A GPU training monitor that kills training processes that have stopped responding.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import subprocess | |
import time | |
import psutil | |
# Number of idle samples after which a process is reported on stdout.
WARN_AFTER = 10
# Number of idle samples after which a process is killed.
KILL_AFTER = 300

# Maps pid -> number of samples in a row that showed 0% SM utilization.
nvtask = {}


def parse_pmon(output):
    """Return ``(pid, sm_util)`` pairs for the compute rows of pmon output.

    ``output`` is the text printed by ``nvidia-smi pmon -c 1``.  Each data
    row is whitespace separated with the columns gpu, pid, type, sm, ...
    Rows that cannot be interpreted are skipped instead of raising, so one
    odd line never takes the monitor down.
    """
    samples = []
    for line in output.splitlines():
        fields = line.split()
        # We need at least gpu, pid, type and sm; header rows start with '#'.
        if len(fields) < 4 or fields[0].startswith('#'):
            continue
        # Ignore graphics processes and rows whose type is the '-' filler.
        if fields[2] in ('G', '-'):
            continue
        try:
            samples.append((int(fields[1]), int(fields[3])))
        except ValueError:
            # pid or utilization is not numeric (e.g. '-'): nothing to count.
            continue
    return samples


def update_idle_counts(samples, counts):
    """Update the per-pid idle counters in ``counts`` from one sample.

    A row with 0% utilization increments the counter of its pid; any other
    utilization resets it to 0.  Once a counter exceeds ``WARN_AFTER`` the
    process is reported on stdout.  ``counts`` is modified in place.
    """
    for pid, sm_util in samples:
        if sm_util == 0:
            counts[pid] = counts.get(pid, 0) + 1
            if counts[pid] > WARN_AFTER:
                print('Process {} counted for {}'.format(pid, counts[pid]))
        else:
            counts[pid] = 0


def reap_idle(counts):
    """Forget pids that no longer exist and kill those idle for too long.

    ``counts`` is modified in place.  A pid is dropped from it once a kill
    has been attempted, so it is not signalled again on every pass.
    """
    # Iterate over a snapshot because entries are deleted while looping.
    for pid, zerocount in list(counts.items()):
        if not psutil.pid_exists(pid):
            del counts[pid]
        elif zerocount > KILL_AFTER:
            try:
                os.kill(pid, 9)  # 9 is SIGKILL
            except ProcessLookupError:
                # The process exited between the existence check and the kill.
                pass
            except PermissionError:
                print('Cannot kill process {}: permission denied'.format(pid))
            del counts[pid]


def main():
    """Sample GPU processes once per second and kill the stuck ones."""
    while True:
        nvrescmd = subprocess.run(['nvidia-smi', 'pmon', '-c', '1'],
                                  stdout=subprocess.PIPE)
        # Process names may contain arbitrary bytes; never fail on decoding.
        nvresout = nvrescmd.stdout.decode('utf-8', errors='replace')
        update_idle_counts(parse_pmon(nvresout), nvtask)
        reap_idle(nvtask)
        # Refresh the on-screen status, then clear it before the next pass.
        os.system('nvidia-smi')
        os.system('sensors')
        time.sleep(1)
        os.system('clear')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment