Last active
April 28, 2021 19:41
-
-
Save mmterpstra/bdd1b856550b672d87001c60284b5f49 to your computer and use it in GitHub Desktop.
Molgenis compute 5 script for finding failed jobs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Molgenis compute 5 helper: list jobs that started but look failed.

Usage:
    findnotfinished.py /path/to/jobsdir

A job counts as "failed" when its ``<job>.sh.started`` marker exists in the
jobs directory, no matching ``<job>.sh.finished`` marker exists, and no job
reported as running by ``sacct`` matches it.
"""
import os
import subprocess
import sys
import time  # unused here; kept because the original script imported it

STARTED_SUFFIX = ".sh.started"
FINISHED_SUFFIX = ".sh.finished"
SACCT_COMMAND = [
    "sacct",
    "--format=Jobid,jobname,user,State,Elapsed,node,Submit",
    "-s", "r",
    "-p",
]


def find_started_jobs(input_dir):
    """Return paths of ``*.sh.started`` files that lack a ``.sh.finished`` twin.

    Only the file name is inspected, so a directory component that happens
    to contain ".started" can no longer confuse the check.
    """
    started_jobs = []
    for filename in os.listdir(input_dir):
        if not filename.endswith(STARTED_SUFFIX):
            continue
        started_path = os.path.join(input_dir, filename)
        finished_path = os.path.join(
            input_dir, filename[: -len(STARTED_SUFFIX)] + FINISHED_SUFFIX
        )
        if os.path.isfile(started_path) and not os.path.isfile(finished_path):
            started_jobs.append(started_path)
    return started_jobs


def parse_running_jobs(sacct_output):
    """Extract short job names from pipe-separated ``sacct -p`` output.

    The second column is taken as the job name and reduced to its last two
    "_"-separated parts. Lines with fewer than two columns, the header row,
    empty names and the name "batch" are skipped.
    """
    running_jobs = []
    for line in sacct_output.splitlines():
        fields = line.split("|")
        if len(fields) < 2:
            # Blank or malformed line: there is no job-name column to read.
            continue
        job_name = fields[1]
        if fields[0] == "JobID" or not job_name or job_name == "batch":
            # An empty name would otherwise match every started job below.
            continue
        running_jobs.append("_".join(job_name.split("_")[-2:]))
    return running_jobs


def query_running_jobs():
    """Run ``sacct`` and return the short names of the jobs it lists."""
    queue = subprocess.run(SACCT_COMMAND, stdout=subprocess.PIPE)
    return parse_running_jobs(queue.stdout.decode("UTF-8"))


def group_failed_jobs(started_jobs, running_jobs):
    """Group started-but-not-running jobs as ``{job base: [job ids]}``.

    A started job is treated as running when its path ends with
    ``<running name>.sh.started``. For the others the marker suffix is
    removed and the remainder is split at its last "_" into base and id.
    """
    running_suffixes = tuple(name + STARTED_SUFFIX for name in running_jobs)
    failed_jobs = {}
    for job in started_jobs:
        if job.endswith(running_suffixes):
            continue
        # Remove the suffix by slicing: str.strip() takes a *set of
        # characters* and would also eat leading characters of the path.
        if job.endswith(STARTED_SUFFIX):
            job = job[: -len(STARTED_SUFFIX)]
        jobbase, _, jobid = job.rpartition("_")
        failed_jobs.setdefault(jobbase, []).append(jobid)
    return failed_jobs


def format_report(failed_jobs):
    """Return the report lines for the grouped failed jobs."""
    lines = []
    for jobbase, jobids in failed_jobs.items():
        ordered = sorted(jobids)
        if len(ordered) > 1:
            lines.append(
                " - failed jobs " + jobbase + "_{" + ",".join(ordered) + "}.err"
            )
        else:
            # No curly brackets or comma for a single job: easier to copy-paste.
            lines.append(" - failed job " + jobbase + "_" + ordered[0] + ".err")
        lines.append("\t\\_consider checking " + jobbase + "_" + ordered[0] + ".err")
    return lines


def main(argv=None):
    """Run the check for the jobs directory given as the first argument.

    Returns the process exit status: 0 on success, 1 when the directory
    argument is missing or is not a directory.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2 or not os.path.isdir(argv[1]):
        print('Die cannot open dir', file=sys.stderr)
        return 1
    input_dir = argv[1]

    print("# Getting dir listing...(might take a while...)")
    started_jobs = find_started_jobs(input_dir)

    print("# Getting running slurm jobs...")
    running_jobs = query_running_jobs()

    print("# Removing running slurm jobs from started list...")
    failed_jobs = group_failed_jobs(started_jobs, running_jobs)

    print("\n# Here a prettyish list of all the failed jobs.\n")
    for line in format_report(failed_jobs):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I hope this won't be used anymore, by writing this script: