Created
September 25, 2017 08:54
-
-
Save shokoe/2867f86880624ed8f85fb9eba5e49128 to your computer and use it in GitHub Desktop.
Nagios SQS checks for age, size and throughput
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Nagios check for the CloudWatch SQS metric ApproximateAgeOfOldestMessage.
# syntax: check_sqs_age.sh <1:queue name> <2:max delay> <3:age sec warn> <4:age sec crit>
# e.g: check_sqs_age.sh items 900 120 300
# Threshold direction is chosen from the order of warn and crit:
#   warn <  crit -> alert on high values (as in the example above)
#   warn >  crit -> alert on low values
# <max delay> is the maximum accepted age, in seconds, of the datapoint itself.
queue_name=$1
max_delay=$2
warn=$3
crit=$4
# Seconds by which the 300-second query window is shifted into the past.
base_delay=360
# Append a status fragment to the global $msg; fragments are joined with ", ".
# Globals:   msg (read, written)
# Arguments: $1 - text to append
msg_append() {
  if [[ -z ${msg:-} ]]; then
    msg=$1
  else
    msg="$msg, $1"
  fi
}
# Escalate the global exit code $ec to $1 when $1 is higher; never lowers it.
# Globals:   ec (read, written; treated as 0 while unset)
# Arguments: $1 - candidate exit code (callers pass 1 or 2)
ec_esca() {
  if [ "${ec:-0}" -lt "$1" ]; then
    ec=$1
  fi
}
# Pretty-print a duration given in seconds.
#   > 129600 s -> <d>d<h>h<m>m
#   > 5400 s   -> <h>h<m>m
#   > 120 s    -> <m>m<s>s
#   otherwise  -> <s>s
# Arguments: $1 - duration in whole seconds
# Outputs:   the formatted duration on stdout. A non-integer argument (for
#            example the "NA" placeholder) is printed unchanged, and an empty
#            one as "NA", instead of raising test errors and printing "0s".
pptime() {
  local secs=$1
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    printf '%s\n' "${secs:-NA}"
    return 0
  fi
  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Fetch one 300-second "Sum" datapoint of an AWS/SQS metric for $queue_name.
# The query window ends $base_delay seconds in the past.
# Globals:   queue_name, base_delay (read)
# Arguments: $1 - CloudWatch metric name
# Outputs:   "<value> <delay>" on stdout, where <value> is the datapoint with
#            its fractional part removed and <delay> is the number of seconds
#            between now and the datapoint timestamp; "NA NA" when no
#            datapoint was returned or its timestamp could not be parsed.
get_cw() {
  local now out _tag value stamp _unit stamp_epoch
  now=$(date +%s)
  # sed drops the first output line so only datapoint rows remain.
  out=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$1" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$((now - (300 + base_delay)))" \
    --end-time "$((now - base_delay))" \
    --period 300 --statistics Sum --output text | sed 1d)
  if [[ -n $out ]]; then
    # Only the first datapoint row is used.
    read -r _tag value stamp _unit <<< "$out"
    if [[ -n $stamp ]] && stamp_epoch=$(date -d "$stamp" +%s 2>/dev/null); then
      echo "${value/.*/} $(($(date +%s) - stamp_epoch))"
      return 0
    fi
  fi
  echo "NA NA"
}
# Thresholds must be whole numbers; otherwise every comparison below would
# error out and the check would end as OK. Exit 3 is used here as the Nagios
# UNKNOWN status.
for threshold in "$max_delay" "$warn" "$crit"; do
  if ! [[ $threshold =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - usage: check_sqs_age.sh <queue name> <max delay> <age sec warn> <age sec crit>"
    exit 3
  fi
done

read -r age delay <<< "$(get_cw ApproximateAgeOfOldestMessage)"
perf_data="age:$age; delay:$delay;"

if ! [[ $age =~ ^-?[0-9]+$ && $delay =~ ^-?[0-9]+$ ]]; then
  ### no usable datapoint
  ######################
  # The previous test looked at $rate_in, which this script never sets, so a
  # missing metric was reported as OK.
  msg_append "unable to get oldest message age for $queue_name"
  ec_esca 2
else
  ### check delays
  ################
  if [ "$delay" -gt "$max_delay" ]; then
    msg_append "delay too big ($(pptime "$delay")>$(pptime "$max_delay"))"
    ec_esca 2
  fi

  ### check age
  #############
  if [ "$warn" -gt "$crit" ]; then
    # warn > crit: alert on low values
    if [ "$age" -lt "$crit" ]; then
      msg_append "oldest item age $(pptime "$age") (<$(pptime "$crit")) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$age" -lt "$warn" ]; then
      msg_append "oldest item age $(pptime "$age") (<$(pptime "$warn")) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "oldest item age $(pptime "$age") (>=$(pptime "$warn")) with $(pptime "$delay") delay"
    fi
  else
    # warn <= crit: alert on high values
    if [ "$age" -gt "$crit" ]; then
      msg_append "oldest item age $(pptime "$age") (>$(pptime "$crit")) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$age" -gt "$warn" ]; then
      msg_append "oldest item age $(pptime "$age") (>$(pptime "$warn")) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "oldest item age $(pptime "$age") (<=$(pptime "$warn")) with $(pptime "$delay") delay"
    fi
  fi
fi

echo "$msg | $perf_data"
#echo "<pre>$info</pre>"
exit "${ec:-0}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Nagios check for the CloudWatch SQS metric ApproximateNumberOfMessagesVisible.
# syntax: check_sqs_size.sh <1:queue name> <2:max delay> <3:size warn> <4:size crit>
# e.g: check_sqs_size.sh items 900 20 10
# Threshold direction is chosen from the order of warn and crit:
#   warn >  crit -> alert on low values (as in the example above)
#   warn <= crit -> alert on high values
# <max delay> is the maximum accepted age, in seconds, of the datapoint itself.
queue_name=$1
max_delay=$2
warn=$3
crit=$4
# Seconds by which the 300-second query window is shifted into the past.
base_delay=360
# Add one fragment to the global status line $msg, comma-separating fragments.
# Globals:   msg (read, written)
# Arguments: $1 - text to append
msg_append() {
  if [[ -n ${msg:-} ]]; then
    msg="$msg, $1"
  else
    msg=$1
  fi
}
# Keep the highest exit code seen so far in the global $ec.
# Globals:   ec (read, written; counts as 0 while unset)
# Arguments: $1 - candidate exit code (callers pass 1 or 2)
ec_esca() {
  if [ "${ec:-0}" -lt "$1" ]; then
    ec=$1
  fi
}
# Format a number of seconds for humans.
#   > 129600 s -> <d>d<h>h<m>m
#   > 5400 s   -> <h>h<m>m
#   > 120 s    -> <m>m<s>s
#   otherwise  -> <s>s
# Arguments: $1 - duration in whole seconds
# Outputs:   formatted duration on stdout; a non-integer argument is echoed
#            back unchanged ("NA" when empty) rather than mis-reported as "0s".
pptime() {
  local secs=$1
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    printf '%s\n' "${secs:-NA}"
    return 0
  fi
  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Query CloudWatch for one 300-second "Sum" datapoint of an AWS/SQS metric of
# $queue_name. The window ends $base_delay seconds before now.
# Globals:   queue_name, base_delay (read)
# Arguments: $1 - CloudWatch metric name
# Outputs:   "<value> <delay>" on stdout: the datapoint value without its
#            fractional part, and the seconds elapsed since the datapoint
#            timestamp. Prints "NA NA" when there is no datapoint or the
#            timestamp cannot be parsed.
get_cw() {
  local now out _tag value stamp _unit stamp_epoch
  now=$(date +%s)
  # The first line of the text output is discarded; the rest are datapoints.
  out=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$1" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$((now - (300 + base_delay)))" \
    --end-time "$((now - base_delay))" \
    --period 300 --statistics Sum --output text | sed 1d)
  if [[ -n $out ]]; then
    # read consumes only the first datapoint row.
    read -r _tag value stamp _unit <<< "$out"
    if [[ -n $stamp ]] && stamp_epoch=$(date -d "$stamp" +%s 2>/dev/null); then
      echo "${value/.*/} $(($(date +%s) - stamp_epoch))"
      return 0
    fi
  fi
  echo "NA NA"
}
# Thresholds must be whole numbers; otherwise every comparison below would
# error out and the check would end as OK. Exit 3 is used here as the Nagios
# UNKNOWN status.
for threshold in "$max_delay" "$warn" "$crit"; do
  if ! [[ $threshold =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - usage: check_sqs_size.sh <queue name> <max delay> <size warn> <size crit>"
    exit 3
  fi
done

read -r size delay <<< "$(get_cw ApproximateNumberOfMessagesVisible)"
perf_data="size:$size; delay:$delay;"

if ! [[ $size =~ ^-?[0-9]+$ && $delay =~ ^-?[0-9]+$ ]]; then
  ### no usable datapoint
  ######################
  # The previous test looked at $rate_in, which this script never sets, so a
  # missing metric was reported as OK.
  msg_append "unable to get queue size for $queue_name"
  ec_esca 2
else
  ### check delays
  ################
  if [ "$delay" -gt "$max_delay" ]; then
    msg_append "delay too big ($(pptime "$delay")>$(pptime "$max_delay"))"
    ec_esca 2
  fi

  ### check size
  ##############
  if [ "$warn" -gt "$crit" ]; then
    # warn > crit: alert on low values
    if [ "$size" -lt "$crit" ]; then
      msg_append "size $size (<$crit) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$size" -lt "$warn" ]; then
      msg_append "size $size (<$warn) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "size $size (>=$warn) with $(pptime "$delay") delay"
    fi
  else
    # warn <= crit: alert on high values
    if [ "$size" -gt "$crit" ]; then
      msg_append "size $size (>$crit) with $(pptime "$delay") delay"
      ec_esca 2
    elif [ "$size" -gt "$warn" ]; then
      msg_append "size $size (>$warn) with $(pptime "$delay") delay"
      ec_esca 1
    else
      msg_append "size $size (<=$warn) with $(pptime "$delay") delay"
    fi
  fi
fi

echo "$msg | $perf_data"
#echo "<pre>$info</pre>"
exit "${ec:-0}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Nagios check for SQS throughput, built from the CloudWatch metrics
# NumberOfMessagesSent, NumberOfMessagesReceived and NumberOfMessagesDeleted.
# syntax: check_sqs_throughput.sh <1:queue name> <2:max delay> <3:incoming warn> <4:incoming crit>
# e.g: check_sqs_throughput.sh jobs 900 20 10
# <incoming warn>/<incoming crit> are rates in item/s; they are multiplied by
# 300 before being compared with the 300-second sums.
# Threshold direction is chosen from the order of warn and crit:
#   warn >= crit -> alert on low incoming rate (as in the example above)
#   warn <  crit -> alert on high incoming rate
queue_name=$1
max_delay=$2
in_warn=$3
in_crit=$4
# Seconds by which the 300-second query window is shifted into the past.
base_delay=360
# Collect status fragments in the global $msg, joined by ", ".
# Globals:   msg (read, written)
# Arguments: $1 - text to append
msg_append() {
  if [[ -z ${msg:-} ]]; then
    msg=$1
    return 0
  fi
  msg="$msg, $1"
}
# Record $1 in the global $ec when it exceeds the current value.
# Globals:   ec (read, written; counts as 0 while unset)
# Arguments: $1 - candidate exit code (callers pass 1 or 2)
ec_esca() {
  if [ "${ec:-0}" -lt "$1" ]; then
    ec=$1
  fi
}
# Render a duration in seconds as a short human-readable string.
#   > 129600 s -> <d>d<h>h<m>m
#   > 5400 s   -> <h>h<m>m
#   > 120 s    -> <m>m<s>s
#   otherwise  -> <s>s
# Arguments: $1 - duration in whole seconds
# Outputs:   formatted duration on stdout; non-integer input is printed as is
#            ("NA" when empty) instead of triggering test errors and "0s".
pptime() {
  local secs=$1
  if ! [[ $secs =~ ^-?[0-9]+$ ]]; then
    printf '%s\n' "${secs:-NA}"
    return 0
  fi
  if [ "$secs" -gt 129600 ]; then
    echo "$((secs / 86400))d$((secs % 86400 / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 5400 ]; then
    echo "$((secs / 3600))h$((secs % 3600 / 60))m"
  elif [ "$secs" -gt 120 ]; then
    echo "$((secs / 60))m$((secs % 60))s"
  else
    echo "$((secs))s"
  fi
}
# Convert a 300-second count into a per-second rate with two decimals; a
# trailing ".00" is dropped.
# Arguments: $1 - count over 300 seconds
# Outputs:   rate on stdout, e.g. 600 -> 2, 450 -> 1.50
to_sec() {
  awk -v a="$1" 'BEGIN { printf "%0.2f", a / 300 }' | sed 's#\.00$##'
}
# Convert a per-second rate into a whole count over 300 seconds.
# Arguments: $1 - rate in item/s (may be fractional)
# Outputs:   integer count on stdout
to_five() {
  # %d truncates, so a product that floating point places just below a whole
  # number would come out one too low; the tiny offset guards against that.
  awk -v a="$1" 'BEGIN { printf "%d", a * 300 + 0.000001 }'
}
# Fetch one 300-second "Sum" datapoint of an AWS/SQS metric for $queue_name.
# The query window ends $base_delay seconds in the past.
# Globals:   queue_name, base_delay (read)
# Arguments: $1 - CloudWatch metric name
# Outputs:   "<raw> <rate> <delay>" on stdout: the datapoint value without
#            its fractional part, the same value converted by to_sec, and the
#            seconds elapsed since the datapoint timestamp.
#            Prints "NA NA NA" when there is no datapoint or the timestamp
#            cannot be parsed. A single "NA" used to be printed here, which
#            left the rate and delay fields empty for callers that read three
#            fields and test the rate against "NA".
get_cw() {
  local now out _tag value stamp _unit stamp_epoch
  now=$(date +%s)
  # sed drops the first output line so only datapoint rows remain.
  out=$(aws cloudwatch get-metric-statistics --namespace AWS/SQS \
    --metric-name "$1" \
    --dimensions "Name=QueueName,Value=$queue_name" \
    --start-time "$((now - (300 + base_delay)))" \
    --end-time "$((now - base_delay))" \
    --period 300 --statistics Sum --output text | sed 1d)
  if [[ -n $out ]]; then
    # Only the first datapoint row is used.
    read -r _tag value stamp _unit <<< "$out"
    if [[ -n $stamp ]] && stamp_epoch=$(date -d "$stamp" +%s 2>/dev/null); then
      echo "${value/.*/} $(to_sec "$value") $(($(date +%s) - stamp_epoch))"
      return 0
    fi
  fi
  echo "NA NA NA"
}
# Collect the three throughput metrics. Each get_cw call yields a raw
# 300-second sum, the per-second rate and the age of the datapoint in seconds.
read -r raw_out rate_out delay_out <<< "$(get_cw NumberOfMessagesReceived)"
read -r raw_in rate_in delay_in <<< "$(get_cw NumberOfMessagesSent)"
read -r raw_del rate_del delay_del <<< "$(get_cw NumberOfMessagesDeleted)"

# "lost" is deleted minus received, expressed per second. It is computed once
# and reported as NA when either count is missing, instead of running shell
# arithmetic on empty or placeholder values.
if [[ $raw_del =~ ^-?[0-9]+$ && $raw_out =~ ^-?[0-9]+$ ]]; then
  lost_rate=$(to_sec "$((raw_del - raw_out))")
else
  lost_rate=NA
fi

# Performance data: every rate_* and delay_* variable, then the lost rate.
perf_data=$(
  for z in ${!rate_*} ${!delay_*}; do
    printf '%s:%s; ' "$z" "${!z}"
  done
  printf 'lost:%s;' "$lost_rate"
)

# Format a datapoint age with pptime, or NA when it is not a whole number.
fmt_age() {
  if [[ $1 =~ ^[0-9]+$ ]]; then
    pptime "$1"
  else
    echo "NA"
  fi
}

# Aligned table for the long plugin output; empty cells are shown as NA so
# that column -t keeps the columns in place.
info=$(printf '%s\n' \
  "metric item/s delay delay(s)" \
  "out ${rate_out:-NA} $(fmt_age "$delay_out") ${delay_out:-NA}" \
  "in ${rate_in:-NA} $(fmt_age "$delay_in") ${delay_in:-NA}" \
  "del ${rate_del:-NA} $(fmt_age "$delay_del") ${delay_del:-NA}" | column -t)
info="$info
lost items: $lost_rate"
### check delays
################
# Flag every delay_* variable whose value exceeds $max_delay.
# Globals:   delay_* (read), max_delay (read), delays_err and delays_list
#            (written), msg/ec through msg_append and ec_esca
# The list was previously rebuilt from a misspelled variable ($delay_list), so
# only the last late metric ever appeared in the message.
check_delays() {
  local var label
  delays_err=0
  delays_list=""
  for var in ${!delay_*}; do
    label=${var#delay_}
    # Values that are not whole numbers (missing datapoints) are skipped here.
    [[ ${!var} =~ ^-?[0-9]+$ ]] || continue
    if [ "${!var}" -gt "$max_delay" ]; then
      delays_list="${delays_list:+$delays_list }$label"
      delays_err=1
    fi
  done
  if ((delays_err)); then
    msg_append "delays too big ($delays_list>$max_delay)"
    ec_esca 2
  fi
}
check_delays
### check lost
##############
# Report "lost" items: more messages deleted than received while the two
# datapoints are more than 30 seconds apart in age.
# NOTE(review): alerting only when the datapoints are MORE than 30 s apart
# looks like it may be inverted - confirm the intent before changing it.
# Globals:   delay_out, delay_del, raw_out, raw_del (read); msg/ec through
#            msg_append and ec_esca
check_lost() {
  local field gap surplus
  # Nothing can be computed unless all four inputs are whole numbers.
  for field in "$delay_out" "$delay_del" "$raw_out" "$raw_del"; do
    [[ $field =~ ^-?[0-9]+$ ]] || return 0
  done
  gap=$((delay_out - delay_del))
  if ((gap < 0)); then
    gap=$((-gap))
  fi
  surplus=$((raw_del - raw_out))
  if ((gap > 30 && surplus > 0)); then
    msg_append "found $(to_sec "$surplus") lost items"
    ec_esca 2
  fi
}
check_lost
### check incoming rate
#######################
# Thresholds are item/s rates and must be plain non-negative numbers. Exit 3
# is used here as the Nagios UNKNOWN status.
if ! [[ $in_warn =~ ^[0-9]*[.]?[0-9]+$ && $in_crit =~ ^[0-9]*[.]?[0-9]+$ ]]; then
  echo "UNKNOWN - usage: check_sqs_throughput.sh <queue name> <max delay> <incoming warn> <incoming crit>"
  exit 3
fi

# The comparison is done on 300-second counts, so convert both rates once.
warn_count=$(to_five "$in_warn")
crit_count=$(to_five "$in_crit")

if ! [[ $raw_in =~ ^-?[0-9]+$ ]]; then
  # No usable datapoint. Testing only rate_in against "NA" missed the case
  # where the raw count is the placeholder and the rate is empty, which then
  # ended as OK.
  msg_append "unable to get incoming rate for $queue_name"
  ec_esca 2
elif [ "$warn_count" -ge "$crit_count" ]; then
  # warn >= crit: alert on low values
  if [ "$raw_in" -lt "$crit_count" ]; then
    msg_append "incoming rate ${rate_in} item/s (<$in_crit) with $(pptime "$delay_in") delay"
    ec_esca 2
  elif [ "$raw_in" -lt "$warn_count" ]; then
    msg_append "incoming rate ${rate_in} item/s (<$in_warn) with $(pptime "$delay_in") delay"
    ec_esca 1
  else
    msg_append "incoming rate ${rate_in} item/s (>=$in_warn) with $(pptime "$delay_in") delay"
  fi
else
  # warn < crit: alert on high values
  if [ "$raw_in" -ge "$crit_count" ]; then
    msg_append "incoming rate ${rate_in} item/s (>=$in_crit) with $(pptime "$delay_in") delay"
    ec_esca 2
  elif [ "$raw_in" -gt "$warn_count" ]; then
    msg_append "incoming rate ${rate_in} item/s (>$in_warn) with $(pptime "$delay_in") delay"
    ec_esca 1
  else
    msg_append "incoming rate ${rate_in} item/s (<=$in_warn) with $(pptime "$delay_in") delay"
  fi
fi

# First line: status text and performance data; then the detail table.
echo "$msg | $perf_data"
echo "<pre>$info</pre>"
exit "${ec:-0}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# syntax: check_sqs_throughput.sh <1:queue name> <2:max delay> <3:incoming warn> <4:incoming crit>
# e.g: check_sqs_throughput.sh jobs 900 20 10
# if (warn >= crit) the script alerts on low values, otherwise on high values
define command{
    command_name check_sqs_throughput
    command_line $USER1$/check_sqs_throughput.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
# syntax: check_sqs_size.sh <1:queue name> <2:max delay> <3:size warn> <4:size crit>
# e.g: check_sqs_size.sh items 900 20 10
# if (warn > crit) the script alerts on low values, otherwise on high values
define command{
    command_name check_sqs_size
    command_line $USER1$/check_sqs_size.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
# syntax: check_sqs_age.sh <1:queue name> <2:max delay> <3:age sec warn> <4:age sec crit>
# e.g: check_sqs_age.sh items 900 120 300
# if (warn > crit) the script alerts on low values, otherwise on high values
define command{
    command_name check_sqs_age
    command_line $USER1$/check_sqs_age.sh $ARG1$ $ARG2$ $ARG3$ $ARG4$
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment