Created
April 1, 2021 19:13
-
-
Save zhw12/16616cf69e63cbfe38c37b854f2ea2f5 to your computer and use it in GitHub Desktop.
GPU info logging and scheduling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""GPU info logging and scheduling""" | |
import gpustat | |
import time | |
import argparse | |
import json | |
import logging | |
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) | |
def log(params):
    """Append periodic gpustat snapshots to a JSON-lines file, forever.

    Every ``params.logging_every`` seconds one record (query time plus the
    per-GPU info reported by gpustat) is appended to ``params.logging_file``.
    Once ``params.max_records`` records have been written since the last
    trim, the file is cut back to its most recent ``params.min_records``
    lines so that it cannot grow without bound.

    Args:
        params: namespace providing ``logging_file``, ``logging_every``,
            ``max_records`` and ``min_records``.

    This function never returns; stop it by terminating the process.
    """
    from collections import deque

    # A negative value makes no sense; treat it as "keep nothing".
    keep = max(params.min_records, 0)
    steps = 0
    while True:
        res = gpustat.new_query().jsonify()
        logging_record = {'query_time': str(res['query_time']),
                          'gpus': res['gpus']}
        with open(params.logging_file, 'a') as fout:
            # each line is a subset of the gpustat query result
            fout.write(json.dumps(logging_record) + '\n')
        if steps >= params.max_records:
            # Keep only the tail of the file. A bounded deque holds at most
            # `keep` raw lines (and correctly keeps none when keep == 0), so
            # nothing accumulates between trims and lines do not have to be
            # parsed and re-serialised just to be copied.
            with open(params.logging_file, 'r') as fin:
                recent = deque(fin, maxlen=keep)
            with open(params.logging_file, 'w') as fout:
                fout.writelines(recent)
            # Restart the counter from what is actually left in the file
            # (the increment below brings it to len(recent)).
            steps = len(recent) - 1
        steps += 1
        logging.info(str(logging_record))
        time.sleep(params.logging_every)
def do_process():
    """Placeholder hook: currently does nothing and returns None.

    Replace the body with whatever job should be launched.
    """
    return None
def check_availability(params):
    """Poll gpustat until a GPU meets the availability criterion.

    The GPUs are queried up to ``params.max_tries`` times. As soon as one
    GPU satisfies the criterion, ``do_process()`` is called and the function
    returns immediately. Between failed attempts it waits 1, 2, 5, 10 and
    then 30 minutes, repeating the 30-minute wait for every later attempt.

    Args:
        params: namespace optionally providing ``max_tries`` (int). If the
            attribute is missing, polling continues until a GPU is free.

    Returns:
        True if a GPU was found and ``do_process()`` was called, False if
        all attempts were used up without finding one.
    """
    time_gaps = [1, 2, 5, 10, 30]  # minutes to wait after the n-th failed check
    max_tries = getattr(params, 'max_tries', float('inf'))
    num_try = 0
    while num_try < max_tries:
        res = gpustat.new_query().jsonify()
        for gpu_info in res['gpus']:
            # write your available criteria here
            if gpu_info['memory.used'] < 1000:  # M
                do_process()  # fork a process and exit checking program
                return True
        num_try += 1
        if num_try >= max_tries:
            # No further attempt will follow, so do not wait for nothing.
            break
        # The attempt counter and the gap index are kept separate: the gap
        # saturates at the last entry while the counter keeps increasing,
        # which is what lets the loop terminate.
        gap_minutes = time_gaps[min(num_try - 1, len(time_gaps) - 1)]
        time.sleep(gap_minutes * 60)  # time.sleep takes seconds
    return False
# Command-line entry point: --log runs the background logger, --check polls
# for a free GPU. If both flags are given, --log takes precedence.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--logging_every", type=int, default=60,
                        help="logging every x seconds")
    parser.add_argument("--logging_file", type=str, default="/tmp/gpuinfo_log.jsonl",
                        help="logging file")
    parser.add_argument("--max_records", type=int, default=1000,
                        help="max records")
    parser.add_argument("--min_records", type=int, default=500,
                        help="truncate records to x lines")
    # check_availability() reads params.max_tries, so it must be defined here.
    parser.add_argument("--max_tries", type=int, default=100,
                        help="max number of availability checks")
    parser.add_argument('--check', default=False, action='store_true',
                        help='check availability')
    parser.add_argument('--log', default=False, action='store_true',
                        help='log gpu info to the logging file')
    params = parser.parse_args()
    if params.log:
        log(params)
    elif params.check:
        check_availability(params)
    else:
        # Neither mode was selected: show usage instead of exiting silently.
        parser.print_help()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment