Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Last active April 28, 2021 19:41
Show Gist options
  • Save mmterpstra/bdd1b856550b672d87001c60284b5f49 to your computer and use it in GitHub Desktop.
Save mmterpstra/bdd1b856550b672d87001c60284b5f49 to your computer and use it in GitHub Desktop.
Molgenis compute 5 script for finding failed jobs.
#!/usr/bin/env python3
import sys, os, time, subprocess
# Usage: findnotfinished.py /path/to/jobsdir
#
# Reports jobs in a jobs directory that have a "<job>.sh.started" marker file
# but no "<job>.sh.finished" marker and that sacct does not list as running.

STARTED_SUFFIX = ".sh.started"
FINISHED_SUFFIX = ".sh.finished"
SACCT_COMMAND = ["sacct", "--format=Jobid,jobname,user,State,Elapsed,node,Submit", "-s", "r", "-p"]


def find_unfinished_jobs(input_dir):
    """Return paths of '*.sh.started' files in input_dir lacking a '.sh.finished' twin.

    Only the file name is inspected, so a directory whose own name contains
    '.started' cannot confuse the match. Entries are visited in sorted order
    to make the report deterministic.

    Raises OSError if input_dir cannot be listed.
    """
    unfinished = []
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(STARTED_SUFFIX):
            continue
        started_path = os.path.join(input_dir, filename)
        finished_path = started_path[:-len(STARTED_SUFFIX)] + FINISHED_SUFFIX
        if os.path.isfile(started_path) and not os.path.isfile(finished_path):
            unfinished.append(started_path)
    return unfinished


def parse_running_job_names(sacct_output):
    """Extract short job names from pipe-separated sacct output.

    The second column is the job name; its last two '_'-separated tokens are
    kept, joined by '_'. The header row, rows without a name column, empty
    names and the "batch" step rows are skipped.
    """
    names = []
    for line in sacct_output.splitlines():
        fields = line.split("|")
        if len(fields) < 2:
            # Blank or malformed line: there is no job name column to read.
            continue
        if fields[0].strip().lower() == "jobid":
            # Header row, not a job.
            continue
        full_name = fields[1]
        # An empty name would turn into the bare suffix ".sh.started" and
        # wrongly mark every started job as running.
        if not full_name or full_name == "batch":
            continue
        names.append("_".join(full_name.split("_")[-2:]))
    return names


def group_failed_jobs(started_jobs, running_jobs):
    """Group started-but-not-running jobs by base name.

    started_jobs: paths ending in '.sh.started'.
    running_jobs: short job names as returned by parse_running_job_names().

    Returns a dict mapping the path up to the last '_' to the list of
    trailing ids, e.g. {"dir/s01_step": ["0", "3"]}.
    """
    running_suffixes = tuple(name + STARTED_SUFFIX for name in running_jobs)
    failed = {}
    for job in started_jobs:
        if job.endswith(running_suffixes):
            continue
        # Remove the marker suffix exactly; str.strip() would strip a set of
        # characters from both ends and mangle the job name.
        if job.endswith(STARTED_SUFFIX):
            job = job[:-len(STARTED_SUFFIX)]
        jobbase, _, jobid = job.rpartition("_")
        failed.setdefault(jobbase, []).append(jobid)
    return failed


def _job_id_sort_key(jobid):
    """Sort numeric ids by value (2 before 10), everything else after them."""
    try:
        return (0, int(jobid), jobid)
    except ValueError:
        return (1, 0, jobid)


def format_failed_jobs(failed_jobs):
    """Return the report lines for a mapping produced by group_failed_jobs()."""
    lines = []
    for jobbase, jobids in failed_jobs.items():
        ordered = sorted(jobids, key=_job_id_sort_key)
        if len(ordered) > 1:
            lines.append(" - failed jobs " + jobbase + "_{" + ",".join(ordered) + "}.err")
        else:
            # No curly brackets or comma for a single job, for easy copy-pasting.
            lines.append(" - failed job " + jobbase + "_" + ",".join(ordered) + ".err")
        # NOTE(review): the hint is emitted once per group; the original
        # nesting of this line was lost, confirm this is the intended level.
        lines.append("\t\\_consider checking " + jobbase + "_" + ordered[0] + ".err")
    return lines


def main(argv=None):
    """Run the report for the jobs directory named in argv[1] (default: sys.argv)."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: findnotfinished.py /path/to/jobsdir")
    input_dir = argv[1]

    print("# Getting dir listing...(might take a while...)")
    try:
        started_jobs = find_unfinished_jobs(input_dir)
    except OSError as err:
        sys.exit("Die cannot open dir: {}".format(err))

    print("# Getting running slurm jobs...")
    queue = subprocess.run(SACCT_COMMAND, stdout=subprocess.PIPE)
    if queue.returncode != 0:
        # Without a running list every started job would look failed.
        print("# Warning: sacct exited with status {}; running jobs may be "
              "reported as failed".format(queue.returncode), file=sys.stderr)
    running_jobs = parse_running_job_names(queue.stdout.decode('UTF-8'))

    print("# Removing running slurm jobs from started list...")
    failed_jobs = group_failed_jobs(started_jobs, running_jobs)

    print("\n# Here a prettyish list of all the failed jobs.\n")
    for line in format_failed_jobs(failed_jobs):
        print(line)


if __name__ == "__main__":
    main()
@mmterpstra
Copy link
Author

I hope the Python script won't be needed anymore now that I have written this shell function:

# Print the .err log path of every job in the given directories that has a
# <job>.sh.started marker but no matching <job>.sh.finished marker.
# Usage: findnotfinished DIR [DIR...]
function findnotfinished {
        local dir started
        for dir in "$@"; do
                for started in "$dir"/*.sh.started; do
                        # An unmatched glob is left as a literal pattern; skip it
                        # instead of reporting a bogus job.
                        [ -e "$started" ] || continue
                        if [ ! -e "${started%.started}.finished" ]; then
                                # One path per line; no column(1) pass, which would
                                # re-split paths containing whitespace.
                                printf '%s\n' "${started%.sh.started}.err"
                        fi
                done
        done
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment