Last active
December 22, 2015 00:39
-
-
Save armanbilge/6390764 to your computer and use it in GitHub Desktop.
Outsmarts SLURM by automatically increasing the memory allocation for a failing job.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# outsmart-slurm.bash
# Outsmarts SLURM by automatically increasing the memory allocation for a failing job.
# Usage: bash outsmart-slurm.bash [SBATCH file]
# Download the latest revision at https://gist.github.com/armanbilge/6390764
#
# How it works:
#   1. Submits the SBATCH file with sbatch and records the job id.
#   2. Polls squeue until that job id is no longer listed.
#   3. If slurm-<jobid>.out contains both "CANCELLED" and "memory limit",
#      doubles the --mem=<N> value inside the SBATCH file (edited in place)
#      and resubmits. Otherwise it stops.
# The loop runs in the background; progress is appended to outsmart-<id>.log
# in the current directory.
#
# Environment:
#   OUTSMART_POLL_SECONDS  seconds to sleep between squeue polls (default 10)

set -u

if [[ -z "${1:-}" ]]; then
  echo "*** ERROR *** No SBATCH file specified." 1>&2
  echo "Usage: bash outsmart-slurm.bash [SBATCH file]" 1>&2
  exit 1
fi

readonly sbatch_file=$1
readonly poll_seconds=${OUTSMART_POLL_SECONDS:-10}

# A short random tag keeps log files of concurrent runs apart.
if ! uuid=$(uuidgen); then
  echo "*** ERROR *** uuidgen failed; cannot name the log file." 1>&2
  exit 1
fi
uuid=${uuid:0:8}
readonly log_file="outsmart-${uuid}.log"
touch -- "$log_file"

# Append one line to the progress log.
log_line() {
  printf '%s\n' "$*" >> "$log_file"
}

# Succeeds while job id $1 is still listed by squeue.
# The id is anchored so that e.g. 123 does not match 1234; a following
# non-digit is allowed so ids with a suffix (such as 123_4) still count.
job_in_queue() {
  squeue -h -o '%i' | grep -qE "^$1([^0-9]|\$)"
}

# Submit / wait / inspect / bump-memory loop. Meant to run in the background;
# every `exit` here ends only the background subshell.
monitor() {
  local submit_output jobid job_output mem newmem

  while true; do
    # sbatch's diagnostics go to the log so a failed submission is explainable.
    if ! submit_output=$(sbatch "$sbatch_file" 2>> "$log_file") \
      || [[ ! "$submit_output" =~ [0-9]+ ]]; then
      log_line "*** ERROR *** Something wrong with your input SBATCH. Check the sbatch output in this log for more details."
      exit 1
    fi
    jobid=${BASH_REMATCH[0]}
    log_line "Launched new SLURM job with id $jobid."

    # Sleep between polls instead of spinning on squeue.
    while job_in_queue "$jobid"; do
      sleep "$poll_seconds"
    done

    job_output="slurm-${jobid}.out"
    # Only retry when the job was cancelled AND the reason was the memory
    # limit; both markers must be present in the job's output file.
    if [[ -f "$job_output" ]] \
      && grep -q "CANCELLED" "$job_output" \
      && grep -q "memory limit" "$job_output"; then
      rm -- "$job_output"

      # Take the first --mem=<digits> in the SBATCH file as the current limit.
      mem=$(grep -oE -- '--mem=[0-9]+' "$sbatch_file" | head -n 1)
      mem=${mem#--mem=}
      if [[ -z "$mem" ]]; then
        # Without a --mem value there is nothing to double; resubmitting the
        # unchanged file would just fail the same way forever.
        log_line "*** ERROR *** Job $jobid exceeded its memory limit but no --mem=<number> was found in $sbatch_file."
        exit 1
      fi

      # 10# forces base 10 so a value with leading zeros is not read as octal.
      newmem=$(( 2 * 10#$mem ))
      # The trailing group keeps --mem=40 from matching inside --mem=400.
      sed -i -E "s/--mem=${mem}([^0-9]|\$)/--mem=${newmem}\1/g" "$sbatch_file"
      log_line "Increased memory from $mem MB to $newmem MB."
    else
      log_line "*** JOB COMPLETED *** (or stopped from unknown error)"
      exit
    fi
  done
}

monitor &

echo "Outsmart-SLURM launched. Check outsmart-$uuid.log for progress."
exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment