Skip to content

Instantly share code, notes, and snippets.

@zomux
Created March 4, 2019 04:47
Show Gist options
  • Save zomux/494f616c84df61cf8586dd4c58784de9 to your computer and use it in GitHub Desktop.
Save zomux/494f616c84df61cf8586dd4c58784de9 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3
import sys, os
import time
import random
import getpass
import subprocess
# Make sure the shared queue directory exists before anything reads or writes
# event files in it. makedirs(exist_ok=True) replaces the exists()/mkdir()
# pair, which could raise FileExistsError when two waitgpu processes started
# at the same moment.
os.makedirs("/tmp/waitgpu", exist_ok=True)
def is_gpu_locked(lock_path="/tmp/waitgpu/lock", hold_seconds=60):
    """Return True if the lock file was touched less than ``hold_seconds`` ago.

    The lock is purely time-based: it is considered held while the lock
    file's modification time is recent.

    Args:
        lock_path: Path of the lock file whose mtime is inspected.
        hold_seconds: How long (in seconds) a touch of the lock file counts
            as "locked".

    Returns:
        True while the lock file is younger than ``hold_seconds``; False when
        it is older or does not exist.
    """
    try:
        age = time.time() - os.path.getmtime(lock_path)
    except FileNotFoundError:
        # A missing lock file means nobody has claimed the GPU. Do not create
        # the file here: a freshly created file would read as a brand-new
        # lock and stall the caller for a full hold period.
        return False
    return age < hold_seconds
def lock_gpu(lock_path="/tmp/waitgpu/lock"):
    """Claim the GPU by creating (or emptying) the lock file and touching it.

    Args:
        lock_path: Path of the lock file to create/touch.
    """
    with open(lock_path, "w"):
        pass
    # Set the modification time explicitly instead of relying on the
    # truncating open above to bump it on an already-empty file.
    os.utime(lock_path, None)
def check_first_task(queue_dir="/tmp/waitgpu", stale_seconds=30):
    """Return the id of the live task with the earliest creation time.

    Every file in ``queue_dir`` is read as one tab-separated record of the
    form ``id, user, create_time, heartbeat_time, command``. Files that do
    not contain such a record (for example the empty lock file) are ignored.
    Records whose heartbeat is older than ``stale_seconds`` are deleted from
    the directory; the remaining ones compete for first place.

    Args:
        queue_dir: Directory holding the event files.
        stale_seconds: Maximum heartbeat age, in seconds, for a task to count
            as still waiting.

    Returns:
        The integer id of the live task with the smallest creation time, or
        None when no live task exists.
    """
    now = time.time()
    live = []
    for fname in os.listdir(queue_dir):
        fpath = os.path.join(queue_dir, fname)
        try:
            with open(fpath) as fh:
                # maxsplit=4 keeps tabs inside the command from producing
                # extra fields, which would make the record look invalid.
                fields = fh.read().strip().split("\t", 4)
        except (OSError, UnicodeDecodeError):
            # The file disappeared between listdir() and open(), or is not a
            # readable text record; either way it is not a waiting task.
            continue
        if len(fields) != 5:
            continue
        raw_id, _user, raw_created, raw_heartbeat, command = fields
        try:
            task_id = int(raw_id)
            created = float(raw_created)
            heartbeat = float(raw_heartbeat)
        except ValueError:
            # Malformed record: skip it rather than crash the waiter.
            continue
        if now - heartbeat > stale_seconds:
            # The heartbeat stopped, so drop the stale record.
            try:
                os.remove(fpath)
            except OSError:
                # Best effort: someone else removed it first, or the file
                # belongs to another user.
                pass
        else:
            live.append((created, task_id, command))
    if not live:
        return None
    return min(live)[1]
def is_gpu_available(smi_command=("nvidia-smi",)):
    """Return True if the status command reports no running GPU processes.

    The command is run without a shell and its standard output is searched
    for the text "No running processes found".

    Args:
        smi_command: Sequence of program and arguments to execute.

    Returns:
        True when the marker text is present in the output. False when it is
        absent, or when the command cannot be started or exits with a
        non-zero status (the failure is reported on stderr). An undetermined
        state is treated as "busy" so that a waiting process keeps polling
        instead of crashing.
    """
    try:
        output = subprocess.check_output(list(smi_command))
    except (OSError, subprocess.CalledProcessError) as err:
        print("[waitgpu] could not query GPU status: {}".format(err),
              file=sys.stderr)
        return False
    # Decode the bytes properly instead of matching against their repr().
    text = output.decode("utf-8", errors="replace")
    return "No running processes found" in text
def print_all_guys(queue_dir="/tmp/waitgpu", stale_seconds=30):
    """Print the GPU status followed by the current waiting list.

    Every file in ``queue_dir`` is read as one tab-separated record of the
    form ``id, user, create_time, heartbeat_time, command``. Only records
    whose heartbeat is younger than ``stale_seconds`` are listed, ordered by
    creation time. Unlike ``check_first_task`` this function never deletes
    anything.

    Args:
        queue_dir: Directory holding the event files.
        stale_seconds: Maximum heartbeat age, in seconds, for a task to be
            shown.
    """
    now = time.time()
    waiting = []
    for fname in os.listdir(queue_dir):
        fpath = os.path.join(queue_dir, fname)
        try:
            with open(fpath) as fh:
                # maxsplit=4 keeps commands that contain tabs in one field.
                fields = fh.read().strip().split("\t", 4)
        except (OSError, UnicodeDecodeError):
            # File vanished after listdir() or is not a readable record.
            continue
        if len(fields) != 5:
            continue
        _task_id, user, raw_created, raw_heartbeat, command = fields
        try:
            created = float(raw_created)
            heartbeat = float(raw_heartbeat)
        except ValueError:
            # Malformed record: leave it out of the listing.
            continue
        if now - heartbeat < stale_seconds:
            waiting.append((created, user, command))
    waiting.sort()

    print("-------")
    if is_gpu_available():
        print("Status: GPU is not being used now")
    else:
        print("Status: GPU is occupied now")
    print("-------")
    # NOTE: the header spelling is kept as-is so existing output is unchanged.
    print("Wating list:")
    print("rank\tuser\tcommand")
    for rank, (_created, user, command) in enumerate(waiting, start=1):
        print("{}\t{}\t{}".format(rank, user, command))
    print("-------")
if __name__ == '__main__':
    # Command-line entry point.
    #   waitgpu list       -> print the waiting list
    #   waitgpu <command>  -> queue up, wait for the GPU, then run <command>
    # The arguments are joined with spaces and later handed to the shell
    # unchanged, so shell syntax inside a quoted argument keeps working.
    command = " ".join(sys.argv[1:])
    if command == "list":
        print_all_guys()
    elif command.strip() == "":
        print("Usage:")
        print("1. waitgpu python xxx.py <- run a command")
        print("2. waitgpu list <- see the waiting list")
    else:
        print("[waitgpu] waiting:", command)
        myid = random.randint(0, 999999)
        myhandle = "/tmp/waitgpu/{}.event".format(myid)
        myuser = getpass.getuser()
        create_time = time.time()
        try:
            while True:
                # Rewrite the event file on every iteration; the fourth
                # field is the heartbeat that keeps this task alive in the
                # queue.
                with open(myhandle, "w") as fhandle:
                    fhandle.write("\t".join([
                        str(myid), myuser, str(create_time),
                        str(time.time()), command
                    ]))
                # Run only when the GPU is idle, nobody took the lock
                # recently, and this task is first in the queue.
                if (is_gpu_available() and not is_gpu_locked()
                        and check_first_task() == myid):
                    print("[waitgpu] execute:", command)
                    # Take the lock while the event file still exists, so no
                    # other waiter can become "first" before the lock is
                    # visible.
                    lock_gpu()
                    break
                time.sleep(10)
        finally:
            # Leave the queue whether the GPU was acquired or the wait was
            # interrupted (e.g. Ctrl-C).
            try:
                os.remove(myhandle)
            except OSError:
                pass
        status = os.system(command)
        # Propagate the command's result instead of always exiting with 0.
        if os.WIFEXITED(status):
            exit_code = os.WEXITSTATUS(status)
        elif os.WIFSIGNALED(status):
            exit_code = 128 + os.WTERMSIG(status)
        else:
            exit_code = 1
        sys.exit(exit_code)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment