Skip to content

Instantly share code, notes, and snippets.

@zhuchen115
Created June 11, 2019 23:51
Show Gist options
  • Save zhuchen115/0a6afab1d2d117a9b7fa7d681a9081a6 to your computer and use it in GitHub Desktop.
Save zhuchen115/0a6afab1d2d117a9b7fa7d681a9081a6 to your computer and use it in GitHub Desktop.
A GPU training monitor that detects training processes which have stopped responding (GPU utilization stuck at 0%) and kills them.
import os
import subprocess
import time
import psutil
# The loop polls roughly once per POLL_INTERVAL_SECONDS, so the two
# thresholds below are expressed in "consecutive idle polls".
WARN_AFTER_IDLE_POLLS = 10    # start printing a warning above this count
KILL_AFTER_IDLE_POLLS = 300   # SIGKILL the process above this count
POLL_INTERVAL_SECONDS = 1

# pid -> number of consecutive polls in which the process showed 0% SM
# utilization on every GPU it was listed on.
nvtask = {}


def parse_pmon_output(text):
    """Return ``{pid: sm_util}`` for compute processes in pmon output.

    ``text`` is the stdout of ``nvidia-smi pmon -c 1``.  Each data row is
    whitespace separated and begins with the columns
    ``gpu pid type sm ...``.  Rows are ignored when they:

    * are header rows (first field starts with ``#``),
    * have fewer than the four columns that are read,
    * have type ``G`` (graphics) or ``-`` (no process on that GPU),
    * carry a non-numeric pid or ``sm`` value (``-`` means no sample).

    A pid listed on several GPUs is reported once, with the highest
    utilization seen, so it only counts as idle when idle everywhere.
    """
    utilization = {}
    for line in text.splitlines():
        fields = line.split()
        if len(fields) < 4 or fields[0].startswith('#'):
            continue
        if fields[2] in ('G', '-'):
            continue
        try:
            pid = int(fields[1])
            sm_util = int(fields[3])
        except ValueError:
            # No usable sample for this row: neither idle nor busy.
            continue
        utilization[pid] = max(sm_util, utilization.get(pid, sm_util))
    return utilization


def update_idle_counts(counts, utilization):
    """Update the consecutive-idle counters in ``counts`` in place.

    ``utilization`` maps pid -> SM utilization for the current poll.  A
    pid at 0% has its counter incremented (starting from 1); any other
    value resets the counter to 0.  A warning line is printed once the
    counter exceeds ``WARN_AFTER_IDLE_POLLS``.
    """
    for pid, sm_util in utilization.items():
        if sm_util != 0:
            counts[pid] = 0
            continue
        counts[pid] = counts.get(pid, 0) + 1
        if counts[pid] > WARN_AFTER_IDLE_POLLS:
            print('Process {} counted for {}'.format(pid, counts[pid]))


def kill_stalled(counts):
    """Drop counters of exited processes and SIGKILL stalled ones.

    A process is considered stalled when its counter exceeds
    ``KILL_AFTER_IDLE_POLLS``.  Failures to kill never stop the monitor.
    """
    # Iterate over a snapshot because entries are deleted while looping.
    for pid, idle_polls in list(counts.items()):
        if not psutil.pid_exists(pid):
            del counts[pid]
        elif idle_polls > KILL_AFTER_IDLE_POLLS:
            try:
                os.kill(pid, 9)  # 9 == SIGKILL
            except ProcessLookupError:
                # The process exited between the existence check and the kill.
                counts.pop(pid, None)
            except PermissionError:
                print('Cannot kill process {}: permission denied'.format(pid))


def main():
    """Poll nvidia-smi forever, killing processes that stay idle too long."""
    while True:
        pmon = subprocess.run(['nvidia-smi', 'pmon', '-c', '1'],
                              stdout=subprocess.PIPE)
        # errors='replace': command names are arbitrary bytes and must not
        # be able to crash the monitor with a UnicodeDecodeError.
        pmon_text = pmon.stdout.decode('utf-8', errors='replace')
        update_idle_counts(nvtask, parse_pmon_output(pmon_text))
        kill_stalled(nvtask)
        # Refresh the on-screen status display.
        os.system('nvidia-smi')
        os.system('sensors')
        time.sleep(POLL_INTERVAL_SECONDS)
        os.system('clear')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment