Skip to content

Instantly share code, notes, and snippets.

@shokoe
Created September 25, 2017 08:54
Show Gist options
  • Save shokoe/2867f86880624ed8f85fb9eba5e49128 to your computer and use it in GitHub Desktop.
Save shokoe/2867f86880624ed8f85fb9eba5e49128 to your computer and use it in GitHub Desktop.
Nagios SQS checks for age, size and throughput
#!/bin/bash
# syntax: check_sqs_age.sh <1:queue name> <2:max delay> <3:age sec warn> <4:age sec crit>
# e.g: check_sqs_age.sh items 900 120 300
# if (warn > crit) alert on low values (age below the thresholds); otherwise alert on high values
# How far back (in seconds) the end of the CloudWatch query window is shifted
# from "now"; presumably to allow for CloudWatch publishing lag.
base_delay=360

# Positional arguments, in the order given on the syntax line.
queue_name="$1"   # SQS queue name
max_delay="$2"    # oldest acceptable datapoint age, in seconds
warn="$3"         # warning threshold (age in seconds)
crit="$4"         # critical threshold (age in seconds)
# Append one fragment to the global status line $msg.
# Arguments: $1 - text to add
# Globals:   msg (read and written); fragments are joined with ", "
msg_append() {
  # ${msg:+...} only emits the separator when $msg already holds text.
  msg="${msg:+$msg, }$1"
}
# Escalate the global exit code $ec to $1, but never lower it.
# Arguments: $1 - candidate exit code
# Globals:   ec (read and written; treated as 0 while unset)
# Returns:   non-zero when no escalation took place
ec_esca() {
  [[ "${ec:-0}" -lt "$1" ]] && ec="$1"
}
# Pretty-print a duration given in whole seconds.
# Arguments: $1 - duration in seconds (integer)
# Outputs:   > 129600 s -> "<d>d<h>h<m>m"
#            >   5400 s -> "<h>h<m>m"
#            >    120 s -> "<m>m<s>s"
#            otherwise  -> "<s>s"
#            "NA" when $1 is empty or not an integer
pptime() {
  local secs=$1

  # Placeholders such as "NA" would make the integer tests below fail noisily
  # and end up printed as a misleading "0s"; report them as NA instead.
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    echo "NA"
    return 0
  fi

  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Fetch one 300-second CloudWatch datapoint of an AWS/SQS metric for the queue.
# Arguments: $1 - CloudWatch metric name
# Globals:   queue_name (read), base_delay (read)
# Outputs:   "<integer value> <seconds since the datapoint timestamp>",
#            or "NA NA" when the CLI returned no datapoint
get_cw() {
  local metric=$1
  local now window_end window_start raw value stamp

  # Take the clock once so the window is exactly 300 seconds wide.
  now=$(date +%s)
  window_end=$((now - base_delay))
  window_start=$((window_end - 300))

  # Maximum instead of Sum: the metric queried here is a gauge, and summing
  # every sample that falls into the 300s period would inflate the value.
  # The first line of the text output is the metric label, hence "sed 1d".
  raw=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$metric" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$window_start" --end-time "$window_end" \
    --period 300 --statistics Maximum --output text | sed 1d)

  if [ -n "$raw" ]; then
    # Datapoint line layout: DATAPOINTS <value> <timestamp> <unit>
    read -r _ value stamp _ <<< "$raw"
    # ${value/.*/} strips the fractional part ("42.0" -> "42").
    echo "${value/.*/} $(($(date +%s) - $(date -d "$stamp" +%s)))"
  else
    echo "NA NA"
  fi
}
# Main flow of check_sqs_age.sh: query the age of the oldest message, compare
# it with the thresholds and print a Nagios status line.
# get_cw prints "<age> <delay>", or "NA NA" when there is no datapoint.
read -r age delay <<< "$(get_cw ApproximateAgeOfOldestMessage)"
perf_data="age:$age; delay:$delay;"

int_re='^-?[0-9]+$'
if ! [[ $age =~ $int_re && $delay =~ $int_re ]]; then
  # Without a numeric datapoint every integer test below would fail and the
  # check would fall through to an OK result, so report it explicitly.
  msg_append "unable to get oldest message age for $queue_name"
  ec_esca 2
else
  ### check delays
  ################
  # delay is the number of seconds since the datapoint's timestamp.
  if [ "$delay" -gt "$max_delay" ]; then
    msg_append "delay too big ($(pptime "$delay")>$(pptime "$max_delay"))"
    ec_esca 2
  fi

  ### check age
  #############
  if [ "$warn" -gt "$crit" ]; then
    # warn > crit: alert when the age falls BELOW the thresholds
    if [ "$age" -lt "$crit" ]; then
      msg_append "oldest item age $(pptime "$age") (<$(pptime "$crit")) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$age" -lt "$warn" ]; then
      msg_append "oldest item age $(pptime "$age") (<$(pptime "$warn")) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "oldest item age $(pptime "$age") (>$(pptime "$warn")) with $(pptime "$delay") delay"
    fi
  else
    # warn <= crit: alert when the age rises ABOVE the thresholds
    if [ "$age" -gt "$crit" ]; then
      msg_append "oldest item age $(pptime "$age") (>$(pptime "$crit")) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$age" -gt "$warn" ]; then
      msg_append "oldest item age $(pptime "$age") (>$(pptime "$warn")) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "oldest item age $(pptime "$age") (<=$(pptime "$warn")) with $(pptime "$delay") delay"
    fi
  fi
fi

echo "$msg | $perf_data"
# ec stays unset when nothing escalated; exit 0 (OK) explicitly in that case.
exit "${ec:-0}"
#!/bin/bash
# syntax: check_sqs_size.sh <1:queue name> <2:max delay> <3:size warn> <4:size crit>
# e.g: check_sqs_size.sh items 900 20 10
# if (warn > crit) alert on low values (size below the thresholds); otherwise alert on high values
# Positional arguments, in the order given on the syntax line.
queue_name="$1"   # SQS queue name
max_delay="$2"    # oldest acceptable datapoint age, in seconds
warn="$3"         # warning threshold (number of messages)
crit="$4"         # critical threshold (number of messages)

# Seconds by which the end of the CloudWatch query window trails "now".
base_delay=360
# Add a fragment to the global status message $msg, comma-separating it from
# whatever is already there.
# Arguments: $1 - text to add
msg_append() {
  case "$msg" in
    '') msg="$1" ;;
    *)  msg="$msg, $1" ;;
  esac
}
# Raise the global exit code $ec to $1 if $1 is higher; an unset $ec counts as 0.
# Returns non-zero when $ec was left unchanged.
ec_esca() {
  test "${ec:-0}" -lt "$1" && ec="$1"
}
# Pretty-print a duration given in whole seconds.
# Arguments: $1 - duration in seconds (integer)
# Outputs:   > 129600 s -> "<d>d<h>h<m>m"
#            >   5400 s -> "<h>h<m>m"
#            >    120 s -> "<m>m<s>s"
#            otherwise  -> "<s>s"
#            "NA" when $1 is empty or not an integer
pptime() {
  local secs=$1

  # A placeholder such as "NA" cannot go through the integer tests below;
  # print NA rather than error out and show a bogus "0s".
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    echo "NA"
    return 0
  fi

  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Fetch one 300-second CloudWatch datapoint of an AWS/SQS metric for the queue.
# Arguments: $1 - CloudWatch metric name
# Globals:   queue_name (read), base_delay (read)
# Outputs:   "<integer value> <seconds since the datapoint timestamp>",
#            or "NA NA" when the CLI returned no datapoint
get_cw() {
  local metric=$1
  local now window_end window_start raw value stamp

  # Read the clock once so start and end are exactly 300 seconds apart.
  now=$(date +%s)
  window_end=$((now - base_delay))
  window_start=$((window_end - 300))

  # Maximum instead of Sum: the metric queried here is a gauge, and adding up
  # all samples inside the 300s period would overstate it.
  # "sed 1d" drops the metric label that the text output prints first.
  raw=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$metric" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$window_start" --end-time "$window_end" \
    --period 300 --statistics Maximum --output text | sed 1d)

  if [ -n "$raw" ]; then
    # Datapoint line layout: DATAPOINTS <value> <timestamp> <unit>
    read -r _ value stamp _ <<< "$raw"
    # ${value/.*/} strips the fractional part ("17.0" -> "17").
    echo "${value/.*/} $(($(date +%s) - $(date -d "$stamp" +%s)))"
  else
    echo "NA NA"
  fi
}
# Main flow of check_sqs_size.sh: query the number of visible messages,
# compare it with the thresholds and print a Nagios status line.
# get_cw prints "<size> <delay>", or "NA NA" when there is no datapoint.
read -r size delay <<< "$(get_cw ApproximateNumberOfMessagesVisible)"
perf_data="size:$size; delay:$delay;"

int_re='^-?[0-9]+$'
if ! [[ $size =~ $int_re && $delay =~ $int_re ]]; then
  # Without a numeric datapoint every integer test below would fail and the
  # check would fall through to an OK result, so report it explicitly.
  msg_append "unable to get queue size for $queue_name"
  ec_esca 2
else
  ### check delays
  ################
  # delay is the number of seconds since the datapoint's timestamp.
  if [ "$delay" -gt "$max_delay" ]; then
    msg_append "delay too big ($(pptime "$delay")>$(pptime "$max_delay"))"
    ec_esca 2
  fi

  ### check size
  ##############
  if [ "$warn" -gt "$crit" ]; then
    # warn > crit: alert when the size falls BELOW the thresholds
    if [ "$size" -lt "$crit" ]; then
      msg_append "size $size (<$crit) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$size" -lt "$warn" ]; then
      msg_append "size $size (<$warn) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "size $size (>$warn) with $(pptime "$delay") delay"
    fi
  else
    # warn <= crit: alert when the size rises ABOVE the thresholds
    if [ "$size" -gt "$crit" ]; then
      msg_append "size $size (>$crit) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$size" -gt "$warn" ]; then
      msg_append "size $size (>$warn) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "size $size (<=$warn) with $(pptime "$delay") delay"
    fi
  fi
fi

echo "$msg | $perf_data"
# ec stays unset when nothing escalated; exit 0 (OK) explicitly in that case.
exit "${ec:-0}"
#!/bin/bash
# syntax: check_sqs_throughput.sh <1:queue name> <2:max delay> <3:incoming warn> <4:incoming crit>
# e.g: check_sqs_throughput.sh jobs 900 20 10
# if (warn >= crit) alert on low values (incoming rate below the thresholds); otherwise alert on high values
# Seconds by which the end of the CloudWatch query window trails "now".
base_delay=360

# Positional arguments, in the order given on the syntax line.
queue_name="$1"   # SQS queue name
max_delay="$2"    # oldest acceptable datapoint age, in seconds
in_warn="$3"      # warning threshold for the incoming rate
in_crit="$4"      # critical threshold for the incoming rate
# Extend the global status message $msg with one more fragment.
# Arguments: $1 - text to add (joined to existing text with ", ")
msg_append() {
  if [[ -n "$msg" ]]; then
    msg+=", $1"
  else
    msg=$1
  fi
}
# Bump the global exit code $ec up to $1; it is never lowered.
# An unset $ec is treated as 0. Returns non-zero when nothing changed.
ec_esca() {
  local wanted=$1
  [ "${ec:-0}" -lt "$wanted" ] && ec=$wanted
}
# Pretty-print a duration given in whole seconds.
# Arguments: $1 - duration in seconds (integer)
# Outputs:   > 129600 s -> "<d>d<h>h<m>m"
#            >   5400 s -> "<h>h<m>m"
#            >    120 s -> "<m>m<s>s"
#            otherwise  -> "<s>s"
#            "NA" when $1 is empty or not an integer
pptime() {
  local secs=$1

  # Missing metrics reach this function as "NA" or an empty string; print NA
  # instead of failing the integer tests and showing a bogus "0s".
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    echo "NA"
    return 0
  fi

  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Convert a per-300-seconds count into a per-second rate.
# Arguments: $1 - count over 300 seconds
# Outputs:   the rate with two decimals; a trailing ".00" is dropped
to_sec() {
  local per_second
  per_second=$(awk -v count="$1" 'BEGIN { printf "%.2f", count / 300 }')
  printf '%s' "${per_second%.00}"
}
# Convert a per-second rate into a count over 300 seconds.
# Arguments: $1 - rate per second (may be fractional)
# Outputs:   the count, formatted as an integer by awk's %d
to_five() {
  awk -v rate="$1" 'BEGIN { printf "%d", rate * 300 }'
}
# Fetch the 300-second Sum of an AWS/SQS metric for the queue.
# Arguments: $1 - CloudWatch metric name
# Globals:   queue_name (read), base_delay (read)
# Outputs:   "<integer sum> <items per second> <seconds since the datapoint>",
#            or "NA NA NA" when the CLI returned no datapoint
get_cw() {
  local metric=$1
  local now window_end window_start raw value stamp

  # Read the clock once so start and end are exactly 300 seconds apart.
  now=$(date +%s)
  window_end=$((now - base_delay))
  window_start=$((window_end - 300))

  # "sed 1d" drops the metric label that the text output prints first.
  raw=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$metric" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$window_start" --end-time "$window_end" \
    --period 300 --statistics Sum --output text | sed 1d)

  if [ -n "$raw" ]; then
    # Datapoint line layout: DATAPOINTS <value> <timestamp> <unit>
    read -r _ value stamp _ <<< "$raw"
    # ${value/.*/} strips the fractional part ("600.0" -> "600").
    echo "${value/.*/} $(to_sec "$value") $(($(date +%s) - $(date -d "$stamp" +%s)))"
  else
    # Callers read three fields, so emit one placeholder per field.
    echo "NA NA NA"
  fi
}
# Main flow of check_sqs_throughput.sh: read the received/sent/deleted counts,
# build perfdata and an info table, then check datapoint delays, "lost" items
# and the incoming rate.
# get_cw prints "<raw sum> <items/s> <delay>"; fields are "NA" or empty when
# CloudWatch has no datapoint.
read -r raw_out rate_out delay_out <<< "$(get_cw NumberOfMessagesReceived)"
read -r raw_in rate_in delay_in <<< "$(get_cw NumberOfMessagesSent)"
read -r raw_del rate_del delay_del <<< "$(get_cw NumberOfMessagesDeleted)"

int_re='^-?[0-9]+$'

# pptime wrapper that prints NA for missing or non-numeric values.
pp_or_na() {
  if [[ $1 =~ $int_re ]]; then
    pptime "$1"
  else
    echo "NA"
  fi
}

# "lost" is deleted minus received; only computable when both are numeric.
lost="NA"
lost_raw=0
if [[ $raw_del =~ $int_re && $raw_out =~ $int_re ]]; then
  lost_raw=$((raw_del - raw_out))
  lost=$(to_sec "$lost_raw")
fi

# ${!rate_*} / ${!delay_*} expand to the NAMES of all variables with that
# prefix, so no other variable in this script may start with rate_ or delay_.
perf_data=$(
  for z in ${!rate_*} ${!delay_*}; do
    printf '%s:%s; ' "$z" "${!z}"
  done
  printf 'lost:%s;' "$lost"
)

info=$(echo "metric item/s delay delay(s)
out ${rate_out:-NA} $(pp_or_na "$delay_out") ${delay_out:-NA}
in ${rate_in:-NA} $(pp_or_na "$delay_in") ${delay_in:-NA}
del ${rate_del:-NA} $(pp_or_na "$delay_del") ${delay_del:-NA}" | column -t)
info="$info
lost items: $lost"

### check delays
################
# Collect the name of every metric whose datapoint is older than max_delay.
delays_list=""
for d in ${!delay_*}; do
  n=${d#delay_}
  if [[ ${!d} =~ $int_re ]] && [ "${!d}" -gt "$max_delay" ]; then
    delays_list="$delays_list $n"
  fi
done
if [ -n "$delays_list" ]; then
  msg_append "delays too big (${delays_list# }>$max_delay)"
  ec_esca 2
fi

### check lost
##############
# NOTE(review): this fires only when the out/del datapoint timestamps differ by
# more than 30 seconds; condition kept as found - confirm it is intended.
if [[ $delay_out =~ $int_re && $delay_del =~ $int_re ]]; then
  skew=$((delay_out - delay_del))
  if [ "${skew#-}" -gt 30 ] && [ "$lost_raw" -gt 0 ]; then
    msg_append "found $lost lost items"
    ec_esca 2
  fi
fi

### check incoming rate
#######################
# Thresholds are given per second; compare per-300-second counts instead so the
# test stays in integers.
warn_five=$(to_five "$in_warn")
crit_five=$(to_five "$in_crit")
if ! [[ $raw_in =~ $int_re ]]; then
  msg_append "unable to get incoming rate for $queue_name"
  ec_esca 2
elif [ "$warn_five" -ge "$crit_five" ]; then
  # warn >= crit: alert when the incoming rate falls BELOW the thresholds
  if [ "$raw_in" -lt "$crit_five" ]; then
    msg_append "incoming rate ${rate_in} item/s (<$in_crit) with $(pp_or_na "$delay_in") delay"
    ec_esca 2
  elif [ "$raw_in" -lt "$warn_five" ]; then
    msg_append "incoming rate ${rate_in} item/s (<$in_warn) with $(pp_or_na "$delay_in") delay"
    ec_esca 1
  else
    msg_append "incoming rate ${rate_in} item/s (>$in_warn) with $(pp_or_na "$delay_in") delay"
  fi
else
  # warn < crit: alert when the incoming rate rises ABOVE the thresholds
  if [ "$raw_in" -ge "$crit_five" ]; then
    msg_append "incoming rate ${rate_in} item/s (>$in_crit) with $(pp_or_na "$delay_in") delay"
    ec_esca 2
  elif [ "$raw_in" -gt "$warn_five" ]; then
    msg_append "incoming rate ${rate_in} item/s (>$in_warn) with $(pp_or_na "$delay_in") delay"
    ec_esca 1
  else
    msg_append "incoming rate ${rate_in} item/s (<=$in_warn) with $(pp_or_na "$delay_in") delay"
  fi
fi

echo "$msg | $perf_data"
echo "<pre>$info</pre>"
# ec stays unset when nothing escalated; exit 0 (OK) explicitly in that case.
exit "${ec:-0}"
# Nagios command wrapper for check_sqs_throughput.sh; $ARG1$..$ARG4$ are passed through in order.
# syntax: check_sqs_throughput.sh <1:queue name> <2:max delay> <3:incoming warn> <4:incoming crit>
# e.g: check_sqs_throughput.sh jobs 900 20 10
# if (warn >= crit) alert on low values (rate below the thresholds); otherwise alert on high values
define command{
command_name check_sqs_throughput
command_line $USER1$/check_sqs_throughput.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
# Nagios command wrapper for check_sqs_size.sh; $ARG1$..$ARG4$ are passed through in order.
# syntax: check_sqs_size.sh <1:queue name> <2:max delay> <3:size warn> <4:size crit>
# e.g: check_sqs_size.sh items 900 20 10
# if (warn > crit) alert on low values (size below the thresholds); otherwise alert on high values
define command{
command_name check_sqs_size
command_line $USER1$/check_sqs_size.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
# Nagios command wrapper for check_sqs_age.sh; $ARG1$..$ARG4$ are passed through in order.
# syntax: check_sqs_age.sh <1:queue name> <2:max delay> <3:age sec warn> <4:age sec crit>
# e.g: check_sqs_age.sh items 900 120 300
# if (warn > crit) alert on low values (age below the thresholds); otherwise alert on high values
define command{
command_name check_sqs_age
command_line $USER1$/check_sqs_age.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment