Skip to content

Instantly share code, notes, and snippets.

@caljess599
Last active February 26, 2018 16:28
Show Gist options
  • Save caljess599/0fe3e0230c7c4ac758d661bd2af3c124 to your computer and use it in GitHub Desktop.
Save caljess599/0fe3e0230c7c4ac758d661bd2af3c124 to your computer and use it in GitHub Desktop.
#!/bin/bash
# name: whonode
# location: /bin/whonode
# # FUNCTIONS # #
#######################################
# Print the one-line synopsis (plus a pointer to the full documentation)
# and terminate the script.
# Globals:   none
# Arguments: none
# Outputs:   synopsis on stderr
# Returns:   never returns; exits the shell with status 2
#######################################
usage() {
  # "$0" is quoted so a script path containing blanks or glob characters is
  # passed to printf as one argument instead of re-triggering the format.
  printf "usage: %s [-c] [-h] [-n cvpost<num> ] [-j JobID]* [-u user]* [-r] [-s]\n*flag can be passed multiple times\nFor complete documentation, go to https://staff.nrao.edu/wiki/bin/view/CIS/ClusterKnowHowVisible\n" "$0" >&2
  exit 2
}
#######################################
# Print one tab-separated report row per job, ending in the time elapsed.
# Each row is prefixed with the job's exec_host value looked up in $QSTATF.
# Globals:   QSTATF (read) - captured output of `qstat -f`
# Arguments: $1 - job rows in `qstat -a` layout, one job per line
# Outputs:   report rows on stdout; rows whose 9th field is "--" are
#            labelled "BATCH JOB"
#######################################
transform() {
  local rows=$1
  local line id allnodeid node
  while IFS= read -r line; do
    # Skip blank lines (an empty argument still yields one empty line).
    [[ -n ${line//[[:space:]]/} ]] || continue
    read -r id _ <<< "$line"
    # Walk from the line naming this job up to the next exec_port line and
    # collect exec_host. The id is compared literally and must not be
    # preceded by a digit, so job 876 can no longer match job 4876.
    allnodeid=$(awk -v id="$id" '
      !in_job {
        pos = index($0, id)
        if (pos == 0 || (pos > 1 && substr($0, pos - 1, 1) ~ /[0-9]/)) next
        in_job = 1
        if (/exec_host/) print $3
        next
      }
      /exec_host/ { print $3 }
      /exec_port/ { in_job = 0 }
    ' <<< "$QSTATF")
    node=${allnodeid/\/0+cvpost0/+}
    # The node name is handed to awk as data (-v) rather than being pasted
    # into the program text / printf format.
    awk -v node="$node" '{
      if ($9 == "--") {
        printf "%s\t%s\t%-8s\t%s\t%s\t%s\tBATCH JOB\t%s\tBATCH JOB\n", node, $1, $2, $5, $6, $7, $10
      } else {
        printf "%s\t%s\t%-8s\t%s\t%s\t%s\t%s\t%s\t%s\n", node, $1, $2, $5, $6, $7, $9, $10, $11
      }
    }' <<< "$line"
  done <<< "$rows"
}
#######################################
# Print one tab-separated report row per job, ending in the hours left
# (9th field minus 11th field, as awk evaluates them numerically).
# Each row is prefixed with the job's exec_host value looked up in $QSTATF.
# Globals:   QSTATF (read) - captured output of `qstat -f`
# Arguments: $1 - job rows in `qstat -a` layout, one job per line
# Outputs:   report rows on stdout; rows whose 9th field is "--" are
#            labelled "BATCH JOB" and show "N/A" for the hours left
#######################################
transform2() {
  local rows=$1
  local line id allnodeid node
  while IFS= read -r line; do
    # Skip blank lines (an empty argument still yields one empty line).
    [[ -n ${line//[[:space:]]/} ]] || continue
    read -r id _ <<< "$line"
    # Walk from the line naming this job up to the next exec_port line and
    # collect exec_host. The id is compared literally and must not be
    # preceded by a digit, so job 876 can no longer match job 4876.
    allnodeid=$(awk -v id="$id" '
      !in_job {
        pos = index($0, id)
        if (pos == 0 || (pos > 1 && substr($0, pos - 1, 1) ~ /[0-9]/)) next
        in_job = 1
        if (/exec_host/) print $3
        next
      }
      /exec_host/ { print $3 }
      /exec_port/ { in_job = 0 }
    ' <<< "$QSTATF")
    node=${allnodeid/\/0+cvpost0/+}
    # The node name is handed to awk as data (-v) rather than being pasted
    # into the program text / printf format.
    awk -v node="$node" '{
      if ($9 == "--") {
        printf "%s\t%s\t%-8s\t%s\t%s\t%s\tBATCH JOB\t%s\tN/A\n", node, $1, $2, $5, $6, $7, $10
      } else {
        printf "%s\t%s\t%-8s\t%s\t%s\t%s\t%s\t%s\t%s\n", node, $1, $2, $5, $6, $7, $9, $10, $9 - $11
      }
    }' <<< "$line"
  done <<< "$rows"
}
# Emit the column header used above the "time elapsed" listing.
headline() {
  # "Job ID" carries two extra tabs so it spans the wide job-id column.
  local -a titles=("Node Name" $'Job ID\t\t' "Username" "SessID" "Nodes" "Tasks" "Time Rsvd" "Status" "Time Elapsed")
  local IFS=$'\t'
  printf '%s\n' "${titles[*]}"
}
# Emit the column header used above the "hours left" listing.
headline2() {
  # "Job ID" carries two extra tabs so it spans the wide job-id column.
  local -a titles=("Node Name" $'Job ID\t\t' "Username" "SessID" "Nodes" "Tasks" "Time Rsvd" "Status" "Hrs Left")
  local IFS=$'\t'
  printf '%s\n' "${titles[*]}"
}
# Emit the column header for listings that have no node-name column.
altheadline() {
  # "Job ID" carries two extra tabs so it spans the wide job-id column.
  local -a titles=($'Job ID\t\t' "Username" "SessID" "Nodes" "Tasks" "Time Rsvd" "Status" "Hrs Left")
  local IFS=$'\t'
  printf '%s\n' "${titles[*]}"
}
# Evaluate the arguments, joined by single spaces, as one bc expression
# (bc is run with -l) and print the result.
calc() {
  # Pin IFS locally so "$*" always joins the arguments with a space.
  local IFS=' '
  local expression="$*"
  bc -l <<< "$expression"
}
# # # GLOBAL VARIABLES # # #
# Search /opt/services/torque/bin first (presumably where qstat is installed
# -- confirm on the cluster head node).
PATH="/opt/services/torque/bin:${PATH}"

# Node counts. Temporary values that were used instead at some point:
#TOTALNODES=63
#TOTALIJNODES=61
# Regular values; TOTALIJNODES + TOTALBATCHONLY must add up to TOTALNODES.
TOTALIJNODES=59
TOTALBATCHONLY=5
TOTALNODES=64

# qstat is invoked exactly twice; both outputs are kept and reused.
QSTATA="$(qstat -a)"
QSTATF="$(qstat -f)"
# # # MAIN PROGRAM # # #
#######################################
# Private helper for main: print the exec_host value(s) recorded in $QSTATF
# for one job. Output starts at a line containing the job id and stops at
# the next line containing exec_port.
# The id is compared literally and must not be preceded by a digit, so a
# lookup for 876 cannot pick up job 4876.
# Globals:   QSTATF (read)
# Arguments: $1 - job id, or a leading part of it (e.g. "4789.cvpost")
# Outputs:   third field of each exec_host line found, one per line
#######################################
_whonode_exec_host() {
  [ -n "$1" ] || return 0
  awk -v id="$1" '
    !in_job {
      pos = index($0, id)
      if (pos == 0 || (pos > 1 && substr($0, pos - 1, 1) ~ /[0-9]/)) next
      in_job = 1
      if (/exec_host/) print $3
      next
    }
    /exec_host/ { print $3 }
    /exec_port/ { in_job = 0 }
  ' <<< "$QSTATF"
}

#######################################
# Parse the command line and print the requested report.
# Globals:   QSTATA, QSTATF (read) - captured `qstat -a` / `qstat -f` output
#            TOTALNODES, TOTALIJNODES, TOTALBATCHONLY (read)
# Arguments: the script's command-line arguments
#            -c count only        -h help            -r hours left
#            -s sort by node      -j JobID (repeatable)
#            -n cvpost<num>       -u user  (repeatable)
# Outputs:   report on stdout
# Returns:   most paths terminate the script with `exit`
#######################################
main() {
  # # ARGUMENTS # #
  # Every flag starts empty so a same-named environment variable cannot
  # switch an option on.
  local cflag='' hflag='' jflag='' nflag='' rflag='' sflag='' uflag=''
  local nval=''
  local -a jval=() uval=()
  local OPTION
  local allnodes assigned reserved assignedijs runningbjs requestedijs queuedbjs
  local assignedijsnum requestedijsnum runningbjscount runningbjsnum queuedbjsnum
  local runningbjsnumnode runningbjsnumnodeverb queuedbjsnumjob totalnodesinuse
  local val result var jobid assnodes assnodenum resnodes resnodenum
  local node_re='^cvpost(00[1-9]|0[1-5][0-9]|06[0-4])$'

  while getopts 'chj:n:rsu:x' OPTION; do
    case "$OPTION" in
      c) cflag=1 ;;
      h) hflag=1 ;;
      j)
        jflag=1
        jval+=("$OPTARG")
        ;;
      n)
        nflag=1
        nval=$OPTARG
        ;;
      r) rflag=1 ;;
      s) sflag=1 ;;
      u)
        uflag=1
        uval+=("$OPTARG")
        ;;
      *)
        # unknown option (getopts reports "?") or the unhandled -x
        usage
        exit
        ;;
    esac
  done
  shift $((OPTIND - 1))

  # # LOGIC # #
  if [ "$hflag" ]; then
    echo "****************** W H O N O D E H E L P *********************"
    echo "Usage: whonode [-c] [-h] [-n cvpost<num> ] [-j JobID]* [-u user]* [-r] [-s]"
    echo "* indicates that flag can be passed multiple times"
    echo "by default, 'whonode' printout includes time elapsed and is sorted by most time elapsed first"
    echo
    echo "Various flags can modify this list:"
    echo "-r | replaces time elapsed with hours remaining"
    echo "-s | sorts by nodename instead of by elapsed time. Can be combined with -r"
    echo "-u username | prints reservations only for a specified username. Can be combined with -r"
    echo
    echo "Other commands:"
    echo "-c | prints only a count of reserved nodes and pending reservations"
    echo "-h | prints this help message"
    echo "-j JobID | prints cvpostnode corresponding to given JobID, e.g., 4789"
    echo "-n cvpostnode | prints out who is on the specified node, e.g., cvpost061"
    echo
    echo "Complete documentation is available at https://staff.nrao.edu/wiki/bin/view/CIS/ClusterKnowHowVisible"
    exit
  fi

  # Layout of a `qstat -a` row (fields as awk sees them):
  #   $1 Job ID  $2 Username  $3 Queue  $4 Jobname  $5 SessID  $6 NDS  $7 TSK
  #   $8 Memory  $9 Reqt      $10 Status            $11 Elsp
  # e.g.
  #   5153.cvpost-serv c***** batch    c4r2          145643 1 1 245gb --        R 00:00:00
  #   4876.cvpost-serv t***** interact interactive_j 8478   1 1 --    1080:00:0 R 665:22:21

  # every job row, including pending ones
  allnodes=$(echo "$QSTATA" | grep "[0-9].cvpost-serv")
  # pending jobs show "--" as SessID ($5); everything else is current
  assigned=$(awk '$5 !~ /--/' <<< "$allnodes")
  reserved=$(awk '$5 ~ /--/' <<< "$allnodes")
  # segregated lists are needed for counting
  assignedijs=$(awk '($3 == "interact") && ($5 !~ /--/)' <<< "$allnodes")
  runningbjs=$(awk '($3 == "batch") && ($5 !~ /--/)' <<< "$allnodes")
  requestedijs=$(awk '($3 == "interact") && ($5 ~ /--/)' <<< "$allnodes")
  queuedbjs=$(awk '($3 == "batch") && ($5 ~ /--/)' <<< "$allnodes")

  # interactive jobs: sum the NDS column ($6)
  assignedijsnum=$(awk '{s+=$6} END {print s}' <<< "$assignedijs")
  requestedijsnum=$(awk '{s+=$6} END {print s}' <<< "$requestedijs")

  # number of running batch jobs
  if [ -z "$runningbjs" ]; then
    runningbjscount=0
  else
    runningbjscount=$(($(wc -l <<< "$runningbjs")))
  fi

  # Number of nodes the running batch jobs occupy. Batch jobs can share a
  # node, so look up each job's exec_host and count distinct node names
  # (first 9 characters, e.g. "cvpost061"). sort -u is required here:
  # uniq alone only merges adjacent duplicates.
  runningbjsnum=$(
    while read -r jobid _; do
      _whonode_exec_host "$jobid"
    done <<< "$runningbjs" | cut -c1-9 | sort -u | wc -l
  )
  runningbjsnum=$((runningbjsnum))

  #grammar police!
  if [ "$runningbjsnum" -eq 1 ]; then
    runningbjsnumnode=node
    runningbjsnumnodeverb=is
  else
    runningbjsnumnode=nodes
    runningbjsnumnodeverb=are
  fi

  # queued batch jobs: a simple row count (if there are any rows)
  if [ -z "$queuedbjs" ]; then
    queuedbjsnum=0
  else
    queuedbjsnum=$(($(wc -l <<< "$queuedbjs")))
  fi
  #grammar police!
  if [ "$queuedbjsnum" -eq 1 ]; then
    queuedbjsnumjob=job
  else
    queuedbjsnumjob=jobs
  fi

  # total nodes currently in use (batch + interactive); both are integers
  totalnodesinuse=$((assignedijsnum + runningbjsnum))

  # -j: report the node of each given JobID
  if [ "$jflag" ]; then
    for val in "${jval[@]}"; do
      result=$(_whonode_exec_host "$val.cvpost")
      if [ -z "$result" ]; then
        echo "JobID $val cannot be found"
      else
        echo "JobID $val is on $result"
      fi
    done
    exit
  fi

  # -n: report who (if anyone) holds the given node
  if [ "$nflag" ]; then
    # anchored, so e.g. "cvpost0611" is rejected instead of half-matching
    if [[ $nval =~ $node_re ]]; then
      var=$(echo "$QSTATF" | grep -B11 -- "$nval" | grep Job_Owner | awk '{print $3}')
      var=${var%@*}
      if [ -z "$var" ]; then
        echo "$nval is not currently reserved"
      else
        echo "$nval is reserved by $var"
      fi
    else
      echo "$nval is not a valid cluster node name"
      exit
    fi
  fi

  # no task-specific flags: full listing for all users plus the node counts
  if [[ ! "$cflag" && ! "$uflag" && ! "$jflag" && ! "$nflag" ]]; then
    if [ "$rflag" ]; then
      # hours left
      headline2
      if [ "$sflag" ]; then
        transform2 "$assigned" | sort -r -k1 # by node name
      else
        transform2 "$assigned" | sort -n -k9 # by 9th column
      fi
    else
      # time elapsed
      headline
      if [ "$sflag" ]; then
        transform "$assigned" | sort -r -k1 # by node name
      else
        transform "$assigned" | sort -n -r -k9 # by 9th column
      fi
    fi
    echo
    echo "----------------------------------------"
    if [ "$assignedijsnum" -eq 1 ]; then
      # only 1 node reserved--unlikely but let's have proper grammar if so
      echo "$assignedijsnum (out of $TOTALIJNODES) HPC node is currently reserved for interactive use."
    else
      echo "$assignedijsnum (out of $TOTALIJNODES) HPC nodes are currently reserved for interactive use."
    fi
    if [ "$runningbjscount" -eq 1 ]; then
      echo "$runningbjscount batch job is currently running on $runningbjsnum $runningbjsnumnode."
    else
      echo "$runningbjscount batch jobs are currently running on $runningbjsnum $runningbjsnumnode."
    fi
    echo "$TOTALBATCHONLY nodes are permanently reserved for batch jobs."
    echo
    echo "Total nodes in use: $totalnodesinuse out of $TOTALNODES"
    echo "----------------------------------------"
    if [ "$requestedijsnum" -eq 0 ]; then
      echo "There are no pending node reservations."
    elif [ "$requestedijsnum" -eq 1 ]; then
      echo "There is $requestedijsnum node reservation."
      echo "$requestedijs"
    else
      echo "Reservations are currently pending for $requestedijsnum nodes:"
      echo "$requestedijs"
    fi
    if [ "$queuedbjsnum" -eq 0 ]; then
      echo "There are no batch jobs currently queued."
    elif [ "$queuedbjsnum" -eq 1 ]; then
      echo "There is $queuedbjsnum batch job currently queued."
      echo "$queuedbjs"
    else
      echo "$queuedbjsnum jobs are currently in the batch-job queue:"
      echo "$queuedbjs"
    fi
    exit
  fi

  # -c without -u: counts only
  if [[ "$cflag" && ! "$uflag" ]]; then
    if [ "$requestedijsnum" -eq 1 ]; then
      # only 1 reservation pending
      echo "$assignedijsnum (out of $TOTALIJNODES) nodes are currently reserved, with $requestedijsnum reservation pending."
    else
      # zero or > 1 reservation pending
      echo "$assignedijsnum (out of $TOTALIJNODES) nodes are currently reserved, with $requestedijsnum reservations pending."
    fi
    echo "$runningbjsnum $runningbjsnumnode $runningbjsnumnodeverb in use for batch jobs, with $queuedbjsnum $queuedbjsnumjob in the queue."
    echo "$TOTALBATCHONLY nodes are permanently reserved for batch jobs."
    exit
  fi

  # -u: per-user listings
  if [ "$uflag" ]; then
    if [ "$cflag" ]; then
      # uses the interactive-node totals computed above (the variables
      # previously named on this line were never set, so it printed blanks)
      echo "$assignedijsnum nodes are currently reserved, with $requestedijsnum node reservations pending."
    fi

    # current jobs of each requested user
    for val in "${uval[@]}"; do
      echo
      # every current job row mentioning the user (batch jobs included)
      assnodes=$(echo "$assigned" | grep -- "$val")
      # node total counts interactive jobs only; empty when nothing matched
      assnodenum=$(echo "$assignedijs" | grep -- "$val" | awk '{s+=$6} END {print s}')
      if [ -n "$assnodenum" ]; then
        if [ "$rflag" ]; then
          headline2
        else
          headline
        fi
      fi
      # same row format as the full listing, so reuse the shared formatters
      if [ "$rflag" ]; then
        transform2 "$assnodes"
      else
        transform "$assnodes"
      fi
      if [ -z "$assnodenum" ]; then
        echo "User $val has zero nodes reserved."
      elif [ "$assnodenum" -eq 1 ]; then
        echo "User $val has $assnodenum node reserved."
      else
        echo "User $val has $assnodenum nodes reserved."
      fi
    done

    # pending jobs of each requested user (no node column)
    for val in "${uval[@]}"; do
      echo
      resnodes=$(echo "$reserved" | grep -- "$val")
      resnodenum=$(echo "$reserved" | grep -- "$val" | awk '{s+=$6} END {print s}')
      if [ -n "$resnodenum" ]; then
        altheadline
      fi
      # hours_left is "1" with -r and empty otherwise; blank input is skipped
      awk -v hours_left="${rflag:+1}" '
        NF == 0 { next }
        $9 == "--" {
          printf "%s\t%-8s\t%s\t%s\t%s\tBATCH JOB\t%s\t%s\n", $1, $2, $5, $6, $7, $10, (hours_left ? "N/A" : "BATCH JOB")
          next
        }
        {
          printf "%s\t%-8s\t%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $5, $6, $7, $9, $10, (hours_left ? $9 - $11 : $11)
        }
      ' <<< "$resnodes"
      if [ -z "$resnodenum" ]; then
        echo "User $val has zero reservations in the queue."
      elif [ "$resnodenum" -eq 1 ]; then
        echo "User $val has $resnodenum reservation in the queue."
      else
        echo "User $val has $resnodenum reservations in the queue."
      fi
    done
  fi
} # END OF MAIN
# Script entry point: pass every command-line argument through to main.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment