Created
May 3, 2024 22:07
-
-
Save fanurs/3888e74f7cefdc7597f5bc85813ff36f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Keep a single SLURM job named $JOB_NAME running for the current user and
# print the node and port of that job.

TIMEOUT=60                            # max seconds to wait for a state change
TIMESTEP=3                            # seconds between two polls
JOB_NAME="tunnel"                     # any unique name
SCRIPT="/path/to/interactive.sbatch"  # modify accordingly
# The sbatch script referenced by SCRIPT starts an OpenSSH server to enable
# interactive sessions, allowing IDEs like VSCode to connect remotely.
# Here is an example script: https://crc-pages.pitt.edu/user-manual/slurm/vscode/#steps-performed-only-once
#######################################
# Entry point. Makes sure a job named $JOB_NAME is running for the current
# user and prints its node and port through echo_nc.
# Globals:
#   JOB_NAME, SCRIPT, USER (read)
# Arguments:
#   $1 - optional: -r/--restart to replace the running job with a new one,
#        -h/--help to print the usage text.
# Outputs:
#   Node and port (normal mode) or progress messages (restart mode) on
#   stdout; error messages on stderr.
# Returns:
#   Exits 1 on an invalid argument, a failed submission or a timeout.
#######################################
main() {
  # parse arguments
  local restart_mode=0
  if (( $# > 0 )); then
    case "$1" in
      -r | --restart)
        restart_mode=1
        ;;
      -h | --help)
        echo_usage
        exit 0
        ;;
      *)
        echo "Error: Invalid argument." >&2
        echo_usage >&2
        exit 1
        ;;
    esac
  fi

  # start by querying all jobs named $JOB_NAME
  local -a job_id=()
  local -a job_state=()
  local id state
  while read -r id state; do
    # ignore blank lines so they are not counted as jobs
    [[ -n "$id" ]] || continue
    job_id+=("$id")
    job_state+=("$state")
  done < <(sq --Format='JobID,State' --sort='TimeLeft')
  local n_jobs=${#job_id[@]}

  # if --restart flag is set
  if (( restart_mode )); then
    if (( n_jobs != 1 )) || [[ "${job_state[0]}" != "RUNNING" ]]; then
      echo "Error: --restart flag requires exactly one job and it must be RUNNING." >&2
      exit 1
    fi
    echo "Restarting by submitting a new job."
    if ! sbatch "$SCRIPT" &> /dev/null; then
      echo "Error: Failed to submit $SCRIPT." >&2
      exit 1
    fi
    echo "Submitted the new job."
    echo "Waiting for the new job to start."
    # the old job keeps running meanwhile, hence two running jobs
    wait_till_timeout \
      '[ "$(sq --states=RUNNING | grep -c .)" -eq 2 ]' \
      "Timeout reached. Job is not started."
    echo "New job is running."
    # kill the old job
    echo "Will be cancelling the old job (this terminal instance)."
    echo "You may need to reload this terminal instance to get into the new job."
    scancel "${job_id[0]}" &> /dev/null
    return
  fi

  # exactly one job that is running or about to run: just wait for it
  if (( n_jobs == 1 )) \
    && [[ "${job_state[0]}" == "RUNNING" || "${job_state[0]}" == "PENDING" ]]; then
    wait_till_timeout \
      '[ "$(sq --states=RUNNING | grep -c .)" -eq 1 ]' \
      "Timeout reached. Job is not started."
    echo_nc
    return
  fi

  # From here on the existing jobs (if any) are unusable: either a single job
  # in some other state, or several jobs. Cancel them, then fall through to
  # submit a fresh job; waiting for a running job without submitting one
  # could only ever time out.
  if (( n_jobs == 1 )); then
    scancel "${job_id[0]}" &> /dev/null
    wait_till_timeout \
      '[ "$(sq | grep -c .)" -eq 0 ]' \
      "Timeout reached. Failed to cancel the job."
  elif (( n_jobs >= 2 )); then
    scancel --user="$USER" --name="$JOB_NAME" &> /dev/null
    wait_till_timeout \
      '[ "$(sq | grep -c .)" -eq 0 ]' \
      "Timeout reached. Failed to cancel all jobs."
  fi

  # submit a new job
  if ! sbatch "$SCRIPT" &> /dev/null; then
    echo "Error: Failed to submit $SCRIPT." >&2
    exit 1
  fi
  wait_till_timeout \
    '[ "$(sq --states=RUNNING | grep -c .)" -eq 1 ]' \
    "Timeout reached. Job is not started."
  echo_nc
}
#######################################
# Print the usage/help text.
# Globals:
#   JOB_NAME (read)
# Outputs:
#   Writes the help text to stdout.
#######################################
echo_usage() {
  # one printf argument per output line
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "Options:" \
    " -r, --restart Resubmit a new job. Requires exactly one job named '$JOB_NAME' to be currently active." \
    " -h, --help Display this help message and exit." \
    "" \
    "Description:" \
    " Here, the term 'job' always refers to a SLURM job with the name '$JOB_NAME' under the current user only." \
    " It ensures that there is a single active running job of this name." \
    " Here's how it handles different scenarios:" \
    " - If no jobs are found, it submits a new job." \
    " - If exactly one job is found, it waits until this job is running." \
    " - If more than one job is found, it cancels all such jobs and submits a new one."
}
# shorthand for squeue that we care about in this script:
# restricts the query to jobs of $USER named $JOB_NAME, without the header
# line; any extra arguments are forwarded to squeue unchanged.
sq() {
  # quoted so that a user or job name with spaces/globs stays one argument
  squeue --user="$USER" --name="$JOB_NAME" --local --noheader "$@"
}
#######################################
# wait until the condition is met or timeout is reached
# Globals:
#   TIMESTEP (read) - seconds to sleep between two checks
#   TIMEOUT  (read) - seconds after which the wait is abandoned
# Arguments:
#   $1 - shell snippet evaluated with eval; a zero status ends the wait.
#        Callers in this script pass fixed strings only; never pass
#        untrusted input here.
#   $2 - message printed when the timeout is reached
# Outputs:
#   Writes the timeout message to stderr.
# Returns:
#   0 once the condition holds; exits the shell with status 1 on timeout.
#######################################
wait_till_timeout() {
  local wait_condition=$1
  local timeout_message=$2
  # local, so the caller's variables are never clobbered
  local elapsed=0
  until eval "$wait_condition"; do
    sleep "$TIMESTEP"
    elapsed=$((elapsed + TIMESTEP))
    if (( elapsed >= TIMEOUT )); then
      echo "$timeout_message" >&2
      exit 1
    fi
  done
}
#######################################
# echo the NodeList (node) and Comment (port) of the running job with the
# most time left. Polls until the node is non-empty and the port is purely
# numeric, or until the timeout is reached.
# Globals:
#   TIMESTEP (read) - seconds to sleep between two polls
#   TIMEOUT  (read) - seconds after which polling is abandoned
# Outputs:
#   Writes "<node> <port>" to stdout; the timeout message goes to stderr.
# Returns:
#   0 on success; exits the shell with status 1 on timeout.
#######################################
echo_nc() {
  local node port
  local elapsed=0
  while true; do
    # Jobs are sorted by ascending TimeLeft, so the last line is the job with
    # the most time left. read fails when there is no output at all; that
    # just means "not ready yet", so its status is ignored on purpose.
    read -r node port < <(
      sq --states=RUNNING --sort='TimeLeft' --Format='NodeList,Comment' \
        | tail -n 1
    ) || true
    if [[ -n "$node" && "$port" =~ ^[0-9]+$ ]]; then
      break
    fi
    sleep "$TIMESTEP"
    elapsed=$((elapsed + TIMESTEP))
    if (( elapsed >= TIMEOUT )); then
      echo "Timeout reached. Failed to get valid node and port." >&2
      exit 1
    fi
  done
  printf '%s %s\n' "$node" "$port"
}
# run the script, forwarding all command-line arguments
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment