Last active
August 29, 2015 14:21
-
-
Save tcooper/bd40bac59c4d6660ac92 to your computer and use it in GitHub Desktop.
Slurm utilization for Graphite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Usage : Show queue utilization for running & pending jobs
#
# NOTE: This script requires bash, not a plain POSIX sh: it relies on
#       `function`, `[[ ]]`, `declare`, arrays and arithmetic base prefixes.

# Exit status used when an unknown command-line option is supplied.
readonly E_INVALID_OPT=1
# Print a one-line usage message to stderr and terminate the script.
# Outputs: "Usage: <script path>" on stderr
# Returns: never returns; exits with status 1
usage() {
  printf 'Usage: %s\n' "$0" >&2
  exit 1
}
# Remove the temporary file that holds the captured squeue output.
# Globals: _squeue_tmp (read) - path of the temp file; may be unset or empty
# Returns: 0 when there is nothing to remove, otherwise the status of rm
cleanup () {
  # Nothing to do if the temp file was never created.
  [ -n "${_squeue_tmp:-}" ] || return 0
  # -f: stay quiet if the file is already gone (cleanup may run more than once);
  # --: protect against a path that begins with '-'.
  /bin/rm -f -- "${_squeue_tmp}"
}
# Parse command-line options. The leading ':' in the optstring selects silent
# error reporting, so an unknown option arrives as opt='?' with the offending
# letter in OPTARG.
#   -h  print usage and exit
while getopts ":h" opt; do
  case "${opt}" in
    h)
      usage
      ;;
    \?)
      echo "Error: Invalid option \"-${OPTARG}\"" >&2
      # Fall back to 1 so an invalid option can never exit with status 0,
      # even if the constant is missing.
      exit "${E_INVALID_OPT:-1}"
      ;;
  esac
done
# Store timestamp and hostname for later use...
_timestamp=$(date +%s)      # seconds since the epoch
_hostname=$(hostname -s)    # short host name
_prefix="hosts.hpc.${_hostname}."

# Temp file that will hold the captured squeue output.
_squeue_tmp=$(mktemp) || {
  echo "Error: unable to create temporary file" >&2
  exit 1
}
# Remove the temp file on every exit path, including early error exits.
# rm -f keeps this harmless if the file has already been removed.
trap '/bin/rm -f -- "${_squeue_tmp}"' EXIT
# Various squeue values... | |
#JOBID PARTITION ST SUBMIT_TIME START_TIME TIME TIME_LIMIT TIME_LEFT NODES CPUS GRES | |
#540278 compute R 2015-05-12T14:13:05 2015-05-12T14:33:37 9:52:29 18:00:00 8:07:31 2 48 null | |
#539917 compute R 2015-05-11T10:08:07 2015-05-12T11:28:17 12:57:49 1-00:00:00 11:02:11 72 1728 null | |
#529933 compute R 2015-05-08T02:07:24 2015-05-08T17:10:18 4-07:15:48 5-00:00:00 16:44:12 2 48 null | |
#512211 compute PD 2015-04-27T13:23:19 N/A 0:00 2-00:00:00 2-00:00:00 2 48 null | |
#528687 compute PD 2015-05-07T12:29:57 N/A 0:00 2-00:00:00 2-00:00:00 10 240 null | |
#540326 shared R 2015-05-12T18:46:00 2015-05-12T21:52:28 2:34:48 7-00:00:00 6-21:25:12 1 23 null | |
#534022 shared PD 2015-05-08T17:32:47 2015-05-14T00:27:00 0:00 37:00 37:00 1 24 null | |
#538537_[0-27] gpu PD 2015-05-10T06:37:09 2015-05-14T00:26:00 0:00 2-00:00:00 2-00:00:00 6 24 gpu:4 | |
#540058 gpu PD 2015-05-11T20:39:04 2015-05-14T00:26:00 0:00 5:00 5:00 24 24 gpu:1 | |
#540059 gpu-shared PD 2015-05-11T20:39:19 2015-05-14T00:26:00 0:00 5:00 5:00 1 1 gpu:1 | |
#531366 gpu PD 2015-05-08T14:41:09 2015-05-14T00:26:00 0:00 1:30:00 1:30:00 1 24 gpu:4 | |
# Call Slurm squeue command for running & pending jobs with custom output format and remove parens
# '()' around empty gres. Store result in temp file and parse later line by line...
/usr/bin/squeue -h --state=running,pending --format="%i %P %t %V %S %M %l %L %D %C %b" \
  | /bin/sed 's/(null)/null/g' > "${_squeue_tmp}"
# The pipeline's own status is sed's, so read squeue's status from PIPESTATUS.
# This must be the first command after the pipeline.
_squeue_rc=${PIPESTATUS[0]}
if [ "${_squeue_rc}" -ne 0 ]; then
  # Abort instead of reporting all-zero metrics from an empty capture.
  echo "Error: squeue exited with status ${_squeue_rc}" >&2
  /bin/rm -f -- "${_squeue_tmp}"
  exit 1
fi
# Convert variable length time string to seconds
# Parsable inputs are currently... D-HH:MM:SS, HH:MM:SS, MM:SS or SS
# Arguments: $1 - time string to convert
# Outputs:   the number of seconds on stdout; "0" (plus a warning on stderr)
#            when the input is empty or has a non-numeric field, e.g. UNLIMITED
# Returns:   0 on success, 1 when the input could not be parsed
function dhmsToSecs() {
  # Keep IFS local so a direct (non-subshell) call cannot change or unset the
  # caller's field separator.
  local IFS='-:'
  local -i secs=0
  local -a _dhms=()
  local _field

  # Split on '-' and ':' via read, which (unlike an unquoted $@) does not
  # expose the input to pathname expansion.
  read -r -a _dhms <<< "$*"

  # Reject empty input and any field that is not a plain decimal number; the
  # arithmetic below would otherwise fail or evaluate it as a variable name.
  if (( ${#_dhms[@]} == 0 )); then
    echo "dhmsToSecs: empty time string" >&2
    echo 0
    return 1
  fi
  for _field in "${_dhms[@]}"; do
    if [[ ! ${_field} =~ ^[0-9]+$ ]]; then
      echo "dhmsToSecs: cannot parse time string \"$*\"" >&2
      echo 0
      return 1
    fi
  done

  # NOTE: The 10# prefix forces base 10 so that zero-padded fields such as
  #       08 and 09 are NOT interpreted as (invalid) octal numbers.
  if [[ ${#_dhms[@]} -eq 4 ]]; then
    secs=$(( (10#${_dhms[0]} * 86400) + (10#${_dhms[1]} * 3600) + (10#${_dhms[2]} * 60) + 10#${_dhms[3]} ))
  elif [[ ${#_dhms[@]} -eq 3 ]]; then
    secs=$(( (10#${_dhms[0]} * 3600) + (10#${_dhms[1]} * 60) + 10#${_dhms[2]} ))
  elif [[ ${#_dhms[@]} -eq 2 ]]; then
    secs=$(( (10#${_dhms[0]} * 60) + 10#${_dhms[1]} ))
  else
    secs=$(( 10#${_dhms[0]} ))
  fi

  echo "${secs}"
}
# Multiply SUs by job array task count
# Input: ArrayJobID, SUs/Job
# Output: Total SUs
# NOTE: The multiplication is not implemented yet; the ArrayJobID ($1) is
#       ignored and the per-task SU value ($2) is echoed back unchanged.
function arrayJobMultiplier() {
  # To be completed... return SUs for single task for now
  # Quoted so the value is never word-split or glob-expanded.
  printf '%s\n' "$2"
}
# Accumulators for running (_r_*) and pending (_pd_*) jobs. `declare -i` gives
# each one the integer attribute, and every counter used later is explicitly
# initialised to 0 (including _pd_nodes).
declare -i _r_jobs=0 _r_cpus=0 _r_nodes=0 _r_sus=0
declare -i _pd_jobs=0 _pd_cpus=0 _pd_nodes=0 _pd_sus=0
# Walk the captured squeue output, one job per line, and accumulate job, CPU,
# node and SU totals separately for running and pending jobs. The file is fed
# to the loop by redirection, so no spare file descriptor is left open.
while read -r _jobid _partition _state _submit_time _start_time _time \
  _time_limit _time_left _nodes _cpus _gres; do
  # Skip blank or malformed lines rather than counting them as running jobs or
  # feeding non-numeric fields to the arithmetic below.
  if [[ ! ${_cpus} =~ ^[0-9]+$ || ! ${_nodes} =~ ^[0-9]+$ ]]; then
    echo "Warning: skipping unparsable squeue line for job \"${_jobid}\"" >&2
    continue
  fi

  # Per-job working values. The gres/mem factors are not used in the current
  # SU formula; they are kept alongside the disabled GRES term below.
  declare -i _cpuFactor=1 _gresFactor=1 _memFactor=1
  declare -i _secs=0 _cpuSecs=0 _gresSecs=0 _memSecs=0
  declare -i _sus=0

  if [[ "${_partition}" == "gpu" || "${_partition}" == "gpu-shared" ]]; then
    # GPU's cost 2x/cpu
    _cpuFactor=2
  fi

  if [[ "${_state}" == "PD" ]]; then
    _pd_jobs=$(( _pd_jobs + 1 ))
    _pd_cpus=$(( _pd_cpus + _cpus ))
    _pd_nodes=$(( _pd_nodes + _nodes ))
    # Pending jobs are charged for their full time limit.
    _secs=$(dhmsToSecs "${_time_limit}")
  else
    _r_jobs=$(( _r_jobs + 1 ))
    _r_cpus=$(( _r_cpus + _cpus ))
    _r_nodes=$(( _r_nodes + _nodes ))
    # Running jobs are charged for the time used so far.
    _secs=$(dhmsToSecs "${_time}")
  fi

  # Bare names are used inside $(( )) so an empty value evaluates to 0 instead
  # of collapsing "a*$b*c" into the exponent operator "a**c".
  _cpuSecs=$(( _cpus * _secs * _cpuFactor ))
  _gresSecs=0 #_gresSecs=$((${_gres}*${_secs}*${_gresFactor}))
  _sus=$(( (_cpuSecs + _gresSecs) / 3600 ))

  # All jobs cost a minimum of 1 SU
  if (( _sus < 1 )); then
    _sus=1
  fi

  # Array jobs in pending state are listed as a group, running array jobs can be counted individually
  if [[ ${_jobid} =~ [0-9]+_ ]]; then
    # Quoted: pending array IDs look like 538537_[0-27], which is a glob.
    _sus=$(arrayJobMultiplier "${_jobid}" "${_sus}")
  fi

  # Track queued and running SUs separately
  if [[ "${_state}" == "PD" ]]; then
    _pd_sus=$(( _pd_sus + _sus ))
  else
    _r_sus=$(( _r_sus + _sus ))
  fi
done < "${_squeue_tmp}"
# Emit one "<metric path> <value> <timestamp>" line per metric on stdout.
# _prefix may already end in '.'; strip it so the joined path never contains an
# empty ".." component.
_metric_root="${_prefix%.}.squeue"
printf '%s %s %s\n' "${_metric_root}.nodes.util" "${_r_nodes}" "${_timestamp}"
printf '%s %s %s\n' "${_metric_root}.cpu.util" "${_r_cpus}" "${_timestamp}"
printf '%s %s %s\n' "${_metric_root}.jobs.running" "${_r_jobs}" "${_timestamp}"
printf '%s %s %s\n' "${_metric_root}.jobs.queued" "${_pd_jobs}" "${_timestamp}"
printf '%s %s %s\n' "${_metric_root}.sus.running" "${_r_sus}" "${_timestamp}"
printf '%s %s %s\n' "${_metric_root}.sus.queued" "${_pd_sus}" "${_timestamp}"

# Remove the temp file and report success.
cleanup
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment