Created
March 4, 2019 04:47
-
-
Save zomux/494f616c84df61cf8586dd4c58784de9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import sys, os | |
import time | |
import random | |
import getpass | |
import subprocess | |
# Ensure the shared queue directory exists before anything reads or writes it.
# exist_ok=True avoids the check-then-create race when several waitgpu
# processes start at the same moment.
# NOTE(review): the directory gets the creating user's umask; if several users
# share this queue, confirm that all of them can write here.
os.makedirs("/tmp/waitgpu", exist_ok=True)
def is_gpu_locked(lock_path="/tmp/waitgpu/lock", hold_seconds=60):
    """Return True while the lock file was touched less than *hold_seconds* ago.

    The lock is a plain file whose modification time records the most recent
    claim. A missing lock file means nobody has claimed the GPU, so the
    function returns False and does NOT create the file: checking the lock
    must not acquire it.

    Args:
        lock_path: Path of the lock file.
        hold_seconds: How long, in seconds, a claim stays in force.

    Returns:
        True if the lock file exists and is younger than *hold_seconds*.
    """
    try:
        age = time.time() - os.path.getmtime(lock_path)
    except OSError:
        # No lock file, or it disappeared mid-check: treat as unlocked.
        return False
    return age < hold_seconds
def lock_gpu(lock_path="/tmp/waitgpu/lock"):
    """Claim the GPU by creating the lock file or refreshing its timestamp.

    Args:
        lock_path: Path of the lock file; its parent directory must exist.
    """
    # Create (or truncate) the file and close the handle deterministically.
    with open(lock_path, "w"):
        pass
    # Set the modification time explicitly instead of relying on truncation
    # to refresh it.
    os.utime(lock_path, None)
def check_first_task(queue_dir="/tmp/waitgpu", stale_after=30):
    """Return the id of the live task that has waited longest, or None.

    Every file in *queue_dir* is read as a tab-separated record of
    ``id, user, create_time, heartbeat_time, command``. Files that cannot be
    read or do not parse as such a record (for example the lock file) are
    skipped. A task whose heartbeat is older than *stale_after* seconds is
    treated as dead and its file is removed on a best-effort basis.

    Args:
        queue_dir: Directory holding the event files.
        stale_after: Heartbeat age, in seconds, beyond which a task is dead.

    Returns:
        The integer id of the live task with the smallest creation time, or
        None when no live task is queued.
    """
    now = time.time()
    live = []
    for fname in os.listdir(queue_dir):
        fpath = os.path.join(queue_dir, fname)
        try:
            with open(fpath) as fhandle:
                record = fhandle.read().strip()
        except (OSError, UnicodeDecodeError):
            # The file vanished after listdir(), is a directory, or is not
            # readable text; either way it is not a usable event file.
            continue
        # maxsplit=4 keeps tab characters inside the command in one field.
        fields = record.split("\t", 4)
        if len(fields) != 5:
            continue
        raw_id, _user, raw_created, raw_heartbeat, command = fields
        try:
            task_id = int(raw_id)
            created = float(raw_created)
            heartbeat = float(raw_heartbeat)
        except ValueError:
            # Corrupt or foreign file: ignore it rather than crash the waiter.
            continue
        if now - heartbeat > stale_after:
            # The owner stopped refreshing its heartbeat; drop the entry.
            try:
                os.remove(fpath)
            except OSError:
                # Already removed by another waiter, or not ours to delete.
                pass
        else:
            live.append((created, task_id, command))
    if not live:
        return None
    # Smallest tuple == earliest creation time (ties broken by id, command).
    return min(live)[1]
def is_gpu_available(smi_command=("nvidia-smi",)):
    """Return True when the nvidia-smi report lists no running processes.

    Runs *smi_command* directly (no shell) and looks for the text
    "No running processes found" in its standard output.

    Args:
        smi_command: Program and arguments to run, as a sequence of strings
            (a single string is treated as the program name).

    Returns:
        True if the marker text appears in the command's output.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
        OSError: If the command cannot be started (e.g. it is not installed).
    """
    if isinstance(smi_command, str):
        smi_command = [smi_command]
    raw_output = subprocess.check_output(list(smi_command))
    # Decode the bytes instead of searching their repr().
    report = raw_output.decode("utf-8", errors="replace")
    return "No running processes found" in report
def print_all_guys(queue_dir="/tmp/waitgpu", stale_after=30, gpu_available=None):
    """Print the GPU status followed by the ranked list of waiting tasks.

    Every file in *queue_dir* is read as a tab-separated record of
    ``id, user, create_time, heartbeat_time, command``. Unreadable or
    malformed files are skipped. Tasks whose heartbeat is at most
    *stale_after* seconds old are listed, ordered by creation time.

    Args:
        queue_dir: Directory holding the event files.
        stale_after: Maximum heartbeat age, in seconds, for a listed task.
        gpu_available: GPU status to report; when None it is obtained by
            calling is_gpu_available().
    """
    now = time.time()
    waiting = []
    for fname in os.listdir(queue_dir):
        fpath = os.path.join(queue_dir, fname)
        try:
            with open(fpath) as fhandle:
                record = fhandle.read().strip()
        except (OSError, UnicodeDecodeError):
            # The file vanished after listdir(), is a directory, or is not
            # readable text; skip it instead of aborting the listing.
            continue
        # maxsplit=4 keeps tab characters inside the command in one field.
        fields = record.split("\t", 4)
        if len(fields) != 5:
            continue
        _task_id, user, raw_created, raw_heartbeat, command = fields
        try:
            created = float(raw_created)
            heartbeat = float(raw_heartbeat)
        except ValueError:
            continue
        # Same liveness boundary as the stale check used when picking the
        # next task: an entry is dead only once its age exceeds the limit.
        if now - heartbeat <= stale_after:
            waiting.append((created, user, command))
    waiting.sort()

    if gpu_available is None:
        gpu_available = is_gpu_available()

    print("-------")
    if gpu_available:
        print("Status: GPU is not being used now")
    else:
        print("Status: GPU is occupied now")
    print("-------")
    print("Waiting list:")
    print("rank\tuser\tcommand")
    for rank, (_created, user, command) in enumerate(waiting, start=1):
        print("{}\t{}\t{}".format(rank, user, command))
    print("-------")
if __name__ == '__main__':
    # Command-line entry point:
    #   waitgpu list        -> show the GPU status and the waiting list
    #   waitgpu <command>   -> queue up, then run <command> once the GPU is
    #                          free, unlocked, and this task is first in line
    command = " ".join(sys.argv[1:])
    if command == "list":
        print_all_guys()
    elif command.strip() == "":
        print("Usage:")
        print("1. waitgpu python xxx.py <- run a command")
        print("2. waitgpu list <- see the waiting list")
    else:
        print("[waitgpu] waiting:", command)
        myid = random.randint(0, 999999)
        myhandle = "/tmp/waitgpu/{}.event".format(myid)
        myuser = getpass.getuser()
        create_time = time.time()
        try:
            while True:
                # Rewrite the event file every round: the fourth field is the
                # heartbeat timestamp that other waiters use to detect dead
                # entries.
                with open(myhandle, "w") as fhandle:
                    fhandle.write("\t".join([
                        str(myid), myuser, str(create_time),
                        str(time.time()), command
                    ]))
                # Run only when the GPU is idle, nobody claimed it recently,
                # and this task is at the head of the queue.
                if is_gpu_available() and not is_gpu_locked() and check_first_task() == myid:
                    # Take the lock BEFORE leaving the queue so the next
                    # waiter cannot see an unlocked GPU in between.
                    lock_gpu()
                    break
                time.sleep(10)
        finally:
            # Leave the queue whether we are about to run, were interrupted,
            # or failed; the file may already be gone.
            try:
                os.remove(myhandle)
            except OSError:
                pass
        print("[waitgpu] execute:", command)
        status = os.system(command)
        # Propagate the command's outcome instead of always exiting with 0.
        # os.system() returns a wait status here (POSIX assumed, as the
        # /tmp paths above already imply).
        if os.WIFEXITED(status):
            sys.exit(os.WEXITSTATUS(status))
        if os.WIFSIGNALED(status):
            # Shell convention for "terminated by signal N".
            sys.exit(128 + os.WTERMSIG(status))
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment