-
-
Save TheProjectsGuy/de328d8c6f9dd46a4785bb299575bc47 to your computer and use it in GitHub Desktop.
#!/bin/bash | |
# Copyright (C) 2022 Avneesh Mishra - GNU GPLv3 | |
# | |
# This program is free software: you can redistribute it and/or modify it | |
# under the terms of the GNU General Public License as published by the | |
# Free Software Foundation, either version 3 of the License, or (at your | |
# option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, but | |
# WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General | |
# Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License along | |
# with this program. If not, see <http://www.gnu.org/licenses/>. | |
# Obtain an allocation using `salloc` and run an interactive session on it | |
# using `srun`. Check usage using [-h|--help]. | |
readonly VERSION_MAJOR=1 | |
readonly VERSION_MINOR=14 | |
readonly LAST_MDATE="Tuesday 06 June 2023 12:30:45 PM IST" # Output of `date` | |
VERSION="$VERSION_MAJOR.$VERSION_MINOR" | |
readonly ARGS="$@" # Reset using https://stackoverflow.com/a/4827707 | |
readonly PROGNAME=$(basename $0) | |
readonly PROGPATH=$(realpath $(dirname $0)) | |
# URL on GitHub Gist (for latest version) | |
readonly PROG_URL="https://raw.githubusercontent.com/TheProjectsGuy/DotFiles/main/shellscripts/csinteractive.sh" | |
# Defaults | |
readonly DEF_NUM_CPUS=2 | |
readonly DEF_PARTITION=long | |
readonly DEF_ACCOUNT=research | |
readonly DEF_JOB_NAME=interactive | |
readonly DEF_NUM_GPUS=0 | |
readonly DEF_NUM_NODES=1 | |
readonly DEF_TIME_PERIOD="6:00:00" | |
readonly DEF_USER_SHELL=bash | |
readonly COMNMS="gnode" # A phrase all nodes share (for grep) | |
# ---- Output formats ---- | |
# Rules: | |
# - Debug: No problems, ignore this output | |
# - Info: Information for user | |
# - Warn: Something unusual, but can be handled by this script | |
# - Fatal: Some problem lead to failure (check the exit code) | |
# | |
# Check out: https://en.wikipedia.org/wiki/ANSI_escape_code#Colors | |
# Declare text formats for printing different log messages | |
debug_msg_fmt="\e[2;90m" | |
info_msg_fmt="\e[1;37m" | |
warn_msg_fmt="\e[1;35m" | |
fatal_msg_fmt="\e[2;31m" | |
command_msg_fmt="\e[0;36m" | |
# Wrapper functions | |
echo_debug () { | |
echo -ne $debug_msg_fmt | |
echo $@ | |
echo -ne "\e[0m" | |
} | |
echo_info () { | |
echo -ne $info_msg_fmt | |
echo $@ | |
echo -ne "\e[0m" | |
} | |
echo_warn () { | |
echo -ne $warn_msg_fmt | |
echo $@ | |
echo -ne "\e[0m" | |
} | |
echo_fatal () { | |
echo -ne $fatal_msg_fmt | |
echo $@ | |
echo -ne "\e[0m" | |
} | |
echo_command () { | |
echo -ne $command_msg_fmt | |
echo $@ | |
echo -ne "\e[0m" | |
} | |
# ==== Script variables ==== | |
# Binaries | |
readonly SALLOC_BIN=$(which salloc 2> /dev/null) | |
if [[ -z $SALLOC_BIN ]]; then | |
echo_fatal "Could not find salloc" | |
exit 127 | |
fi | |
readonly SRUN_BIN=$(which srun 2> /dev/null) | |
if [[ -z $SRUN_BIN ]]; then | |
echo_fatal "Could not find srun" | |
exit 127 | |
fi | |
SHELL_PATH="$(which $DEF_USER_SHELL 2> /dev/null)" | |
if [[ -z $SHELL_PATH ]]; then | |
echo_fatal "Could not find shell: $DEF_USER_SHELL" | |
exit 127 | |
fi | |
# Local binaries (no need for checking) | |
readonly sacct_bin=$(which sacct 2> /dev/null) | |
echo_debug "Found $SALLOC_BIN and $SRUN_BIN" | |
echo_debug "Using shell $SHELL_PATH" | |
# Parameters for job allocation | |
NUM_CPUS=$DEF_NUM_CPUS # Number of CPUs | |
NUM_GPUS=$DEF_NUM_GPUS # Number of GPUs | |
NUM_NODES=$DEF_NUM_NODES # Number of Nodes on HPC | |
TIME_PERIOD=$DEF_TIME_PERIOD # Time for the script | |
JOB_NAME=$DEF_JOB_NAME # Job name | |
PARTITION=$DEF_PARTITION # Partition | |
ACCOUNT=$DEF_ACCOUNT # Account (for script) | |
# Options for the scripts | |
SALLOC_OPTS="" # Will be set in create_salloc_opts | |
SRUN_OPTS="" # Will be set in create_srun_opts | |
SRUN_EXEC="" # Will be set in create_srun_opts | |
EXEC_SCRIPT=true # By default, execute the script (no dry run) | |
# ==== Utility functions ==== | |
function usage () { | |
cat <<-EOF | |
Cool sinteractive (SLURM interactive) script, version $VERSION | |
Usage: $PROGNAME [-OPTARG VAL ...] | |
All optional arguments: | |
-a | --account Account name (can also come after -r [1]) | |
(Default: $DEF_ACCOUNT) | |
-b | --begin Give a begin time for the job allocation. | |
Eg: "16:00", "now+1hour", "now+60" (sec), | |
"2010-01-20T12:34:00" | |
-c | --cpu Number of CPUs to request | |
(Default: $DEF_NUM_CPUS) | |
-g | --gpu Number of GPUs to request | |
(Default: $DEF_NUM_GPUS) | |
-h | --help Help menu (this menu) | |
-i | --node-info Show information about a single node or a comma | |
separated list of nodes and exit | |
-j | --job-info Job information for a job number | |
-J | --name Job name (no spaces) | |
(Default: $DEF_JOB_NAME) | |
-K | --kill-all Cancels all running SLURM jobs of user. This | |
should be the only argument passed. | |
-l | --last-node Use the gnode which the user used previously | |
Command uses 'sacct' and 'USER' to fetch data | |
User must have some allocation in last 7 days | |
-m | --mem-per-cpu Minimum RAM per CPU. Check configurations using | |
'scontrol show config' and 'seff' | |
-M | --queue-me Print the user 'squeue' information (running | |
jobs) (User name: $USER) | |
-n | --dry-run Do not get allocation and run; only show output | |
-N | --nodes Number of nodes for 'salloc' allocation | |
(Default: $DEF_NUM_NODES) | |
-p | --partition Partition for node allocation | |
(Default: $DEF_PARTITION) | |
-r | --reservation Reservation (account is also set to same [1]) | |
-s | --shell Shell to use (bash, zsh) | |
-t | --time Time limit for the job allocation (0 = inf) | |
(Default: $DEF_TIME_PERIOD) | |
-u | --update Update the csinteractive script | |
Directory: $PROGPATH | |
Program: $PROGNAME | |
-v | --version Show the version information and exit | |
-w | --nodelist Request specific list of hosts | |
Before using this, check QOSMaxNodePerJobLimit | |
-x | --exclude Exclude the list of nodes passed | |
[1]: Option '-a' can come after '-r' to set different account name | |
[2]: Use "quotes" for list of nodes. Eg: "gnode[007-008,043]" | |
Exit codes: | |
0 Script executed successfully | |
1 Argument error (some wrong option was passed) | |
5 Some error happened in the interactive session | |
127 Command not found | |
References: | |
- sacct: https://slurm.schedmd.com/sacct.html | |
- salloc: https://slurm.schedmd.com/salloc.html | |
- srun: https://slurm.schedmd.com/srun.html | |
EOF | |
} | |
function parse_options () { | |
# Set the passed bash options | |
set -- $ARGS | |
while (( $# )) ; do | |
arg=$1 # Read option | |
shift # Shift to next argument | |
case "$arg" in | |
# Help options | |
"--help" | "-h") | |
usage | |
exit 0 | |
;; | |
# CPUs | |
"--cpu" | "-c") | |
num_cpus=$1 | |
shift | |
echo_debug "Using $num_cpus CPUs" | |
NUM_CPUS=$num_cpus | |
;; | |
# GPUs | |
"--gpu" | "-g") | |
num_gpus=$1 | |
shift | |
echo_debug "Using $num_gpus GPUs" | |
NUM_GPUS=$num_gpus | |
;; | |
# Number of nodes | |
"--nodes" | "-N") | |
num_nodes=$1 | |
shift | |
echo_debug "Using $num_nodes nodes on HPC" | |
NUM_NODES=$num_nodes | |
;; | |
# Reservation | |
"--reservation" | "-r") | |
reserv_name=$1 | |
shift | |
echo_debug "Using '$reserv_name' reservation" | |
ACCOUNT=$reserv_name | |
RESERVATION=$reserv_name | |
;; | |
# Account name | |
"--account" | "-a") | |
acc_name=$1 | |
shift | |
echo_debug "Account name set to '$acc_name'" | |
ACCOUNT=$acc_name | |
;; | |
# Job name | |
"--name" | "-J") | |
job_name=$1 | |
shift | |
echo_debug "Job name set to '$job_name'" | |
JOB_NAME=$job_name | |
;; | |
# Exclude the list of nodes | |
"--exclude" | "-x") | |
exclude_nodes=$1 | |
shift | |
echo_debug "Excluding nodes in '$exclude_nodes'" | |
EXCLUDE_NODES=$exclude_nodes | |
;; | |
# Node list | |
"--nodelist" | "-w") | |
use_nodes=$1 | |
shift | |
echo_debug "Using nodes in '$use_nodes'" | |
NODE_LIST=$use_nodes | |
;; | |
# Shell | |
"--shell" | "-s") | |
custom_shell=$1 | |
shift | |
echo_debug "Using shell $custom_shell" | |
shell_path="$(which $custom_shell 2> /dev/null)" | |
if [[ -z $shell_path ]]; then | |
echo_warn "Shell $custom_shell not found, using bash" | |
else | |
echo_debug "Found shell at $shell_path" | |
SHELL_PATH=$shell_path | |
fi | |
;; | |
# Time limit for allocation | |
"--time" | "-t") | |
new_time=$1 | |
shift | |
echo_debug "Setting time limit from $TIME_PERIOD to $new_time" | |
TIME_PERIOD=$new_time | |
;; | |
# Dry run | |
"--dry-run" | "-n") | |
echo_info "Dry run mode detected" | |
EXEC_SCRIPT=false | |
;; | |
# Previous node | |
"--last-node" | "-l") | |
if [[ -z $sacct_bin ]]; then | |
echo_fatal "Could not find sacct" | |
exit 127 | |
fi | |
echo_debug "Found $sacct_bin" | |
# Check if user exists | |
qry_cmd="sacct -u "$USER" -o NodeList -S now-7days" | |
echo_command $qry_cmd | |
$qry_cmd &> /dev/null | |
ec=$? | |
if [[ $ec -ne 0 ]]; then | |
echo_fatal "sacct could not run (maybe user not found)" | |
fi | |
gn=$($qry_cmd | grep $COMNMS | tail -n 1 | sed "s/ //g") | |
echo_debug "$USER last used '$gn' NodeList" | |
NODE_LIST=$gn | |
;; | |
# Begin time | |
"--begin" | "-b") | |
begin_shell=$1 | |
shift | |
echo_debug "Begin time: '$begin_shell'" | |
START_TIME=$begin_shell | |
;; | |
# Show version | |
"--version" | "-v") | |
echo_info "Version: $VERSION" | |
echo_debug "Last Modified: $LAST_MDATE" | |
exit 0 | |
;; | |
# RAM per CPU | |
"--mem-per-cpu" | "-m") | |
ram_cpu=$1 | |
shift | |
echo_debug "RAM per CPU: $ram_cpu" | |
MEM_PER_CPU=$ram_cpu | |
;; | |
# User queue | |
"--queue-me" | "-M") | |
sq_cmd="squeue -u $USER -o \"%.10i %.15j %.3t %.10M %.4C %20S %9g %.8a %.7m %N(%r)\"" | |
echo_command $sq_cmd | |
eval $sq_cmd | |
echo_debug "Queue information display completed" | |
exit 0 | |
;; | |
# Node information | |
"--node-info" | "-i") | |
nodelist=$1 | |
shift | |
echo_debug "Printing information on nodes: $nodelist" | |
# Show the queue through squeue | |
readonly squeue_bin=$(which squeue 2> /dev/null) | |
if [[ -z $squeue_bin ]]; then | |
echo_fatal "Could not find squeue" | |
exit 127 | |
fi | |
squeue_cmd="$squeue_bin -w $nodelist" | |
squeue_cmd="$squeue_cmd -o \"%.10i %.15j %.3t %.10M %.4C %20S %9g %.7m %N(%r)\"" | |
echo_command $squeue_cmd | |
eval $squeue_cmd | |
echo "" | |
# Show the pestat output | |
readonly pestat_bin=$(which pestat 2> /dev/null) | |
if [[ -z $pestat_bin ]]; then | |
echo_warn "Could not find pestat" | |
else | |
pestat_cmd="$pestat_bin -w $nodelist" | |
echo_command $pestat_cmd | |
eval $pestat_cmd | |
echo "" | |
fi | |
# Show the sinfo output | |
readonly sinfo_bin=$(which sinfo 2> /dev/null) | |
if [[ -z $sinfo_bin ]]; then | |
echo_fatal "Could not find sinfo" | |
exit 127 | |
fi | |
of="Partition:12,Time:12,StateCompact:8,Gres:8,\ | |
GresUsed:10,CPUsState:.15,NodeList:.20" | |
of=$(echo $of | tr -d '[:space:]') | |
sinfo_cmd="$sinfo_bin -n $nodelist -O $of" | |
echo_command $sinfo_cmd | |
eval $sinfo_cmd | |
echo "" | |
# Exit | |
echo_debug "Information display completed" | |
exit 0 | |
;; | |
# Job (number) information | |
"--job-info" | "-j") | |
job_num=$1 | |
shift | |
echo_debug "Getting information on job number: $job_num" | |
# Scontrol information | |
readonly scontrol_bin=$(which scontrol 2> /dev/null) | |
if [[ -z $scontrol_bin ]]; then | |
echo_fatal "Could not find scontrol" | |
exit 127 | |
fi | |
scontrol_cmd="$scontrol_bin show job -d $job_num" | |
echo_command $scontrol_cmd | |
eval $scontrol_cmd | |
# Account information | |
if [[ -z $sacct_bin ]]; then | |
echo_fatal "Could not find sacct" | |
exit 127 | |
fi | |
sacct_cmd="$sacct_bin -j $job_num" | |
echo_command $sacct_cmd | |
eval $sacct_cmd | |
echo "" | |
# Job efficiency information | |
readonly seff_bin="$(which seff 2> /dev/null)" | |
if [[ -z $seff_bin ]]; then | |
echo_fatal "Could not find seff" | |
exit 127 | |
fi | |
seff_cmd="$seff_bin $job_num" | |
echo_command $seff_cmd | |
eval $seff_cmd | |
echo "" | |
# Exit | |
echo_debug "Information display completed" | |
exit 0 | |
;; | |
# Partition | |
"--partition" | "-p") | |
partition=$1 | |
shift | |
echo_debug "Using partition: $partition" | |
PARTITION=$partition | |
;; | |
# Update the script | |
"--update" | "-u") | |
echo_debug "Updating from URL ${PROG_URL}" | |
cd $PROGPATH | |
wget_cmd="wget $PROG_URL -O - > $PROGNAME.new" | |
echo_command $wget_cmd | |
eval $wget_cmd | |
chmod u+x ./$PROGNAME.new | |
mv ./$PROGNAME.new ./$PROGNAME | |
echo_info "Update completed" | |
exit 0 | |
;; | |
# Kill all SLURM jobs | |
"--kill-all" | "-K") | |
echo_info "Going to kill all tasks run by SLURM account" | |
jbs=$(squeue --me -O JOBID | awk '(NR>1) {print $0}' | xargs) | |
if [[ -z $job ]]; then | |
echo_info "User has no jobs" | |
exit 0 | |
fi | |
echo_debug "Job IDs: $jbs" | |
# Find scancel | |
readonly scancel_bin="$(which scancel 2> /dev/null)" | |
if [[ -z $scancel_bin ]]; then | |
echo_fatal "Could not find scancel" | |
exit 127 | |
fi | |
kill_cmd="$scancel_bin $jbs" | |
echo_command $kill_cmd | |
eval $kill_cmd | |
exit_code=$? | |
if [[ ! $exit_code -eq 0 ]]; then | |
echo_warn "Exit code: $exit_code" | |
exit $exit_code | |
fi | |
# Exit | |
echo_debug "All SLURM jobs have been terminated" | |
exit 0 | |
;; | |
*) | |
echo_warn "Unrecognized option: $arg" | |
echo_info "Pass --help to get options and help" | |
exit 1 | |
;; | |
esac | |
done | |
} | |
function create_salloc_opts () { | |
# CPUs | |
SALLOC_OPTS="-n $NUM_CPUS" | |
# Job name | |
SALLOC_OPTS="$SALLOC_OPTS -J $JOB_NAME" | |
# Partition | |
SALLOC_OPTS="$SALLOC_OPTS -p $PARTITION" | |
# Account | |
SALLOC_OPTS="$SALLOC_OPTS -A $ACCOUNT" | |
# Number of nodes | |
SALLOC_OPTS="$SALLOC_OPTS -N $NUM_NODES" | |
# If there is a reservation | |
if [[ -n $RESERVATION ]]; then | |
SALLOC_OPTS="$SALLOC_OPTS --reservation=$RESERVATION" | |
fi | |
# GRES (GPUs) | |
SALLOC_OPTS="$SALLOC_OPTS --gres=gpu:$NUM_GPUS" | |
# If there's a start time given | |
if [[ -n $START_TIME ]]; then | |
SALLOC_OPTS="$SALLOC_OPTS --begin=$START_TIME" | |
fi | |
# If there's a RAM per CPU requirement | |
if [[ -n $MEM_PER_CPU ]]; then | |
SALLOC_OPTS="$SALLOC_OPTS --mem-per-cpu=$MEM_PER_CPU" | |
fi | |
# Time | |
SALLOC_OPTS="$SALLOC_OPTS -t $TIME_PERIOD" | |
# If some nodes must be excluded | |
if [[ -n $EXCLUDE_NODES ]]; then | |
SALLOC_OPTS="$SALLOC_OPTS -x $EXCLUDE_NODES" | |
fi | |
# If specific nodes must be used (distributed probably) | |
if [[ -n $NODE_LIST ]]; then | |
SALLOC_OPTS="$SALLOC_OPTS -w $NODE_LIST" | |
SALLOC_OPTS="$SALLOC_OPTS --ntasks-per-node=$NUM_CPUS" | |
fi | |
echo_info "SALLOC $SALLOC_OPTS" | |
} | |
function create_srun_opts () { | |
# CPUs | |
SRUN_OPTS="-n $NUM_CPUS" | |
# Pseudoterminal | |
SRUN_OPTS="$SRUN_OPTS --pty" | |
# Do not bind tasks to CPUs (they may be different!) | |
SRUN_OPTS="$SRUN_OPTS --cpu-bind=no" | |
# Shell executable (login shell) | |
SRUN_EXEC="$SHELL_PATH -l" | |
echo_info "SRUN $SRUN_OPTS $SRUN_EXEC" | |
} | |
function env_check () { | |
# Check if display forwarding exists | |
if [[ -n $DISPLAY ]]; then | |
# Display exists | |
echo_debug "Display found at: $DISPLAY" | |
else | |
echo_warn "No display found, X11 forwarding may not work" | |
fi | |
} | |
# ==== Main program entrypoint ==== | |
parse_options | |
create_salloc_opts | |
create_srun_opts | |
env_check | |
sess_start_time=$(date) | |
echo_info -e "Starting time: $sess_start_time" | |
# Main command | |
echo_command "$SALLOC_BIN $SALLOC_OPTS $SRUN_BIN $SRUN_OPTS $SRUN_EXEC" | |
if [[ "$EXEC_SCRIPT" = true ]]; then | |
$SALLOC_BIN $SALLOC_OPTS $SRUN_BIN $SRUN_OPTS $SRUN_EXEC | |
else | |
echo_debug "Not running command (dry run)" | |
fi | |
# Interactive session ended | |
exit_code=$? | |
EXIT_STATUS=0 | |
if [[ $exit_code -eq 0 ]]; then | |
echo_debug "Script ended with no error (zero exit code)" | |
else | |
echo_warn "Exit status of session was $exit_code" | |
echo_warn "Something might have gone wrong" | |
EXIT_STATUS=5 | |
fi | |
sess_end_time=$(date) | |
echo_info -e "Ending time: $sess_end_time" | |
echo_debug -e "Started at: $sess_start_time" | |
sess_dur=$(echo $(date -d "$sess_end_time" +%s) - $(date -d "$sess_start_time" +%s) | bc -l) | |
echo_info -e "Duration (HH:MM:SS format): `date -d@$sess_dur -u +%H:%M:%S`" | |
exit $EXIT_STATUS | |
# The below command works: | |
# $SALLOC_BIN -n 10 -J interactive -p long -A research --gres=gpu:1 \ | |
# -t 6:00:00 $SRUN_BIN -n 10 --pty --cpu_bind=no bash -l |
The new release checks for job history in the last 7 days
The new release (version 1.3
) has -v
for version, and -b
for adding a custom starting time.
New release (version 1.4
) has -m
for RAM (memory) per CPU
New release (version 1.5
) has -i
for getting node(s) information
New release (version 1.6
) has -N
for getting a number of nodes for allocation.
New release (version 1.7
) has -p
for getting a particular partition for allocation.
New release (version 1.8
) has -M
for getting squeue of user.
New release (version 1.9
) has -t
for time allocation option.
New release (version 1.10
) has -j
for getting job information. Version also shows the last modified date. The script also shows the duration for which the interactive session lasted.
New release (version 1.11
) has -u
to update the utility (no need to download the script from GitHub gist periodically anymore).
New release (version 1.12
) has -K
to kill all running SLURM jobs (for the particular user). Use this option carefully
Moved the main script to my DotFiles
The new release has
-l
to use the gnode that was last allotted to you, and-n
for a dry run