Skip to content

Instantly share code, notes, and snippets.

@shokoe
Last active October 13, 2019 17:39
Show Gist options
  • Save shokoe/7fd89ad4a829133c3d1684dedee975ca to your computer and use it in GitHub Desktop.
Save shokoe/7fd89ad4a829133c3d1684dedee975ca to your computer and use it in GitHub Desktop.
A Nagios/Naemon script for monitoring CPU burst status for all T2 and T3 EC2 instances in an account
#!/bin/bash
# Nagios/Naemon check: reports the CPU credit (burst) status of the T2/T3 EC2 instances in an account.
# The sourced file presumably provides the Eins / Etul_* helpers used further down - TODO confirm.
. /opt/EC2ulz/EC2ulz.sh
# Maximum accepted age (seconds) of the newest datapoint; older datapoints mark the instance as "Stale".
#max_age=660
max_age=720
# Thresholds on the remaining credit balance, expressed as a percentage of the per-type maximum.
balance_warn=50
balance_crit=20
# history of metrics to get in minutes
metric_time=15
# Couldn't find an API for this data so it's hard coded :(
# This is the max CPUCreditBalance available per instance type
# (table with a header line; joined to the instance table on the Type column).
type_max_balance='Type CPUCreditBalanceMax
t2.nano 72
t2.micro 144
t2.small 288
t2.medium 576
t2.large 864
t2.xlarge 1296
t2.2xlarge 1958.4
t3.nano 144
t3.micro 288
t3.small 576
t3.medium 576
t3.large 864
t3.xlarge 2304
t3.2xlarge 4608
t3a.nano 144
t3a.micro 288
t3a.small 576
t3a.medium 576
t3a.large 864
t3a.xlarge 2304
t3a.2xlarge 4608'
# Instance table (header + rows) with the columns ID, Name, Type and State.
# The Etul_grep stages presumably keep only running instances whose type matches t2/t3/t3a - TODO confirm.
ins_tab=$(Eins ID,Name,Type,State \
  | Etul_grep running \
  | Etul_grep 't[23]a?\.' \
  | Etul_colrst)
# Instance IDs (first column, header skipped) flattened onto one space separated line.
id_list=$(awk 'NR > 1 { print $1 }' <<<"$ins_tab" | xargs)
# Two column table "ID CpuConf" holding the credit specification reported for every instance.
cpu_tab=$(
  {
    echo "ID CpuConf"
    # $id_list stays unquoted on purpose: every ID has to become its own argument.
    aws ec2 describe-instance-credit-specifications \
      --instance-ids $id_list \
      --query 'InstanceCreditSpecifications[*].[InstanceId,CpuCredits]' \
      --output text
  } | sed 's#\t# #g' | column -t
)
# count_fields - prints the number of whitespace separated columns in the header (first line) of a table.
# input - <1:table var>
count_fields(){ head -1 <<<"$1" | wc -w; }

# auto_join - joins two tables vars (that have a header line); rows of the 1st table without a match in
# the 2nd one are kept and padded with NA. Both tables are sorted on their join column automatically.
# input - <1:1st table var> <2:2nd table var> <3:1st table join col, default 1> <4:2nd table join col, default 1>
# output - aligned table (column -t) holding every column of the 1st table followed by columns 2..N of the
#          2nd table (the 2nd table's join column is expected to be its first column).
auto_join(){
  local left=$1 right=$2 lcol=${3:-1} rcol=${4:-1}
  local lcount rcount fields='' i
  lcount=$(count_fields "$left")
  rcount=$(count_fields "$right")
  # Build the join output format (1.1,1.2,...,2.2,2.3,...) without eval. A single column 2nd table
  # simply contributes no extra columns.
  for ((i = 1; i <= lcount; i++)); do fields+="${fields:+,}1.$i"; done
  for ((i = 2; i <= rcount; i++)); do fields+="${fields:+,}2.$i"; done
  {
    # Header lines are joined separately so they always stay on top.
    join -a 1 -e NA -1 "$lcol" -2 "$rcol" -o "$fields" \
      <(head -1 <<<"$left") <(head -1 <<<"$right")
    # join compares the bare join field, so sort on exactly that field (N,N) and ignore the
    # alignment padding in front of it (-b). A plain "sort -k N" orders column -t aligned
    # input by the amount of padding, which leaves the input unsorted from join's point of view.
    join -a 1 -e NA -1 "$lcol" -2 "$rcol" -o "$fields" \
      <(sed 1d <<<"$left" | sort -b -k "$lcol,$lcol") \
      <(sed 1d <<<"$right" | sort -b -k "$rcol,$rcol")
  } | column -t
}
# Start of the merged table: the instance table joined (on the ID column) with the CpuConf column of cpu_tab.
merge_tab=`auto_join "$ins_tab" "$cpu_tab"`
# Template for a single entry of the "aws cloudwatch get-metric-data" query array.
# The @ID@, @METRIC@ and @INS_ID@ placeholders are replaced with sed for every instance/metric pair.
json_skel='{
"Id": "@ID@",
"MetricStat": {
"Metric": {
"Namespace": "AWS/EC2",
"MetricName": "@METRIC@",
"Dimensions": [
{ "Name": "InstanceId", "Value": "@INS_ID@" }
]
},
"Period": 300,
"Stat": "Average",
"Unit": "Count"
},
"ReturnData": true
}'
# float_precision - stdin filter that shortens decimal numbers in a whitespace separated table and
# re-aligns it with column -t. The first column is never touched.
#   |value| >= 10     -> decimals dropped            (12.34  -> 12)
#   1 <= |value| < 10 -> one decimal kept            (9.99   -> 9.9)
#   |value| < 1       -> first non-zero decimal kept (0.0123 -> 0.01)
# Values are truncated, not rounded.
float_precision(){
  # Every rule needs a blank on both sides of the value. Separators are doubled first so two
  # neighbouring values never compete for the same blank (column -t squeezes them again), and a
  # trailing blank is added so the last column is matched as well.
  sed -e 's# #  #g' \
      -e 's#$# #' \
      -e 's# \(-\?[1-9][0-9]\+\)\.[^ ]\+ # \1 #g' \
      -e 's# \(-\?[1-9]\.[0-9]\)[^ ]* # \1 #g' \
      -e 's# \(-\?0\.0*[0-9]\)[^ ]* # \1 #g' \
    | column -t
}
# add_balance_perc - stdin filter that appends a CPUCreditBalancePerc column to the merged table.
# input - table on stdin: a header line followed by rows in which column 7 is the CPUCreditBalance
#         value and column 9 the per-type maximum (CPUCreditBalanceMax).
# The percentage is only calculated when BOTH values are numeric, otherwise NA is printed. Dividing a
# missing balance ("NA") would evaluate to 0 and make a missing datapoint look like an empty balance.
add_balance_perc(){
  local h
  read -r h
  echo "$h CPUCreditBalancePerc"
  awk '{if ($9+0==$9 && $7+0==$7) {P=$7/$9*100} else {P="NA"}; print $0, P}'
}

# Query  - per metric: JSON array with one get-metric-data query per instance
# Answer - per metric: raw JSON answer of aws cloudwatch get-metric-data
# Table  - per metric: table "ID <metric>Age <metric>Data <metric>Diff" built from the answer
declare -A Query Answer Table
#for m in CPUCreditBalance; do
for m in CPUCreditBalance CPUSurplusCreditBalance CPUSurplusCreditsCharged; do
  # One json_skel entry per instance, separated by "," lines; the last "," is dropped by sed '$d'.
  # The query Id is the instance ID without its dash.
  #Qruery[$m]="["$(for i in i-09b2700c8258c60fc; do
  Query[$m]="["$(for i in $id_list; do
    echo "$json_skel" | sed "s#@ID@#${i/-/}#; s#@METRIC@#$m#; s#@INS_ID@#$i#;"
    echo ","
  done | sed '$d')"]"
  # Newest datapoints first, limited to the last $metric_time minutes.
  Answer[$m]=$(aws cloudwatch get-metric-data \
    --start-time "$(date -Isec -d "-${metric_time}min")" \
    --end-time "$(date -Isec)" \
    --metric-data-queries "${Query[$m]}" \
    --scan-by TimestampDescending)
  # Per result: label, newest timestamp, newest value and (newest - previous) value, NA when there
  # are fewer than two datapoints. jq prints missing entries as "null", which is mapped to NA.
  # Etul_ager presumably converts the timestamp column into an age in seconds - TODO confirm.
  Table[$m]=$(
    {
      echo "ID ${m}Age ${m}Data ${m}Diff"
      echo "${Answer[$m]}" | jq -r '.MetricDataResults[] | "\(.Label) \(.Timestamps[0]) \(.Values[0]) \(if (.Values | length) >= 2 then .Values[0]-.Values[1] else "NA" end)"'
    } | sed 's#\bnull\b#NA#g' | column -t | Etul_ager ${m}Age %0S
  )
  # Add the three metric columns to the merged table and keep the rows ordered by the Name column.
  merge_tab=$(auto_join "$merge_tab" "${Table[$m]}" | (read h; echo "$h"; sort -k 2) | column -t)
  if [ "$m" = "CPUCreditBalance" ]; then
    # Only the balance has a per-type maximum: join it on the Type column and add the percentage.
    merge_tab=$(auto_join "$merge_tab" "$type_max_balance" 3 1 | add_balance_perc | column -t)
  fi
done
# classify_status - stdin filter that appends the alert status to every data row of the merged table.
# input - <1:max datapoint age in seconds> <2:balance warning percent> <3:balance critical percent>
#         rows on stdin (no header) with the columns: 5 CpuConf, 6/11/14 datapoint ages,
#         10 balance percent, 12 surplus balance, 15 surplus credits charged.
# output - every row followed by one of: ok Stale Missing Balance_Warn Balance_Crit Surplus CHARGED.
#          Later checks override earlier ones, so a credit alert wins over Stale/Missing.
classify_status(){
  awk -v M="$1" -v BW="$2" -v BC="$3" '{
    Stat="ok"
    # Stale metrics (an NA age is reported as Missing by the next check)
    if ($6>M || $11>M || $14>M) Stat="Stale"
    # Missing any data
    for(i=1; i<=NF; i++) { if ($i=="NA") Stat="Missing"; }
    # Credit checks only look at numeric values ($N+0==$N). A non-numeric value such as NA is
    # compared as a string by awk, and "NA" > 0 is true, which used to report missing data on
    # unlimited instances as Surplus/CHARGED.
    if ($5=="standard") {
      # Balance alerts with trend (only when the last value is not above the previous one), disabled:
      #if ($10 <= BW && $8 <= 0) Stat="Balance_Warn"
      #if ($10 <= BC && $8 <= 0) Stat="Balance_Crit"
      # without trends
      if ($10+0==$10 && $10 <= BW) Stat="Balance_Warn"
      if ($10+0==$10 && $10 <= BC) Stat="Balance_Crit"
    } else if ($5=="unlimited") {
      # Surplus
      if ($12+0==$12 && $12 > 0) Stat="Surplus"
      # Charged
      if ($15+0==$15 && $15 > 0) Stat="CHARGED"
    }
    print $0, Stat
  }'
}

# Final data table: shortened numbers, a "Stat" header and the status of every instance row.
data_tab=$(echo "$merge_tab" | float_precision | (read -r h; echo "$h Stat"; classify_status "$max_age" "$balance_warn" "$balance_crit") | column -t)
### Info with big table
# Long table: metric names in the header are shortened (Balance/Surplus/Charged) and the ID and
# State columns (1 and 4) are blanked out before re-aligning.
info_tab=$(echo "$data_tab" | sed '1{s#CPUCreditBalance#Balance#g; s#CPUSurplusCreditBalance#Surplus#g; s#CPUSurplusCreditsCharged#Charged#g; s#(age)##g;}' | awk '{$1=$4=""; print $0;}' | column -t)
info_tab_help="Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
BalanceAge Cloudwatch CPUCreditBalance last datapoint age in seconds
BalanceData Cloudwatch CPUCreditBalance last datapoint value
BalanceDiff Cloudwatch CPUCreditBalance last two datapoint difference (last-prev)
BalanceMax Max credit possible
BalancePerc Last datapoint as percent of max
SurplusAge Cloudwatch CPUSurplusCreditBalance last datapoint age in seconds
SurplusData Cloudwatch CPUSurplusCreditBalance last datapoint value
SurplusDiff Cloudwatch CPUSurplusCreditBalance last two datapoint difference (last-prev)
ChargedAge Cloudwatch CPUSurplusCreditsCharged last datapoint age in seconds
ChargedData Cloudwatch CPUSurplusCreditsCharged last datapoint value
ChargedDiff Cloudwatch CPUSurplusCreditsCharged last two datapoint difference (last-prev)
Stat Instance alert status"
# The balance alerts are raised on the percentage alone (value <= threshold); the trend condition
# (BalanceDiff<0) is disabled in the status calculation, so it is not advertised here either.
status_help="Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - BalancePerc<=${balance_warn}
Balance_Crit [Critical] - BalancePerc<=${balance_crit}
CpuConf 'unlimited':
Surplus [Warning] - SurplusData>0 (base cpu burst is depleted)
CHARGED [Critical] - ChargedData>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than ${max_age}Sec
Missing [Warning] - less than 2 datapoint found in last $metric_time minutes"
# Full report text: indented long table followed by both help sections.
infoi_full=$(
  echo "Info table:"
  echo "$info_tab" | sed 's#^# #'
  echo "
$info_tab_help
$status_help"
)
### Info with small table
# Short table built from the long one: Name, Type, CpuConf, balance value, balance percent, surplus
# value, charged value and status. A non-zero numeric diff (columns 6, 11 and 14 of the long table)
# is appended to its value in parenthesis.
short_tab=$(echo "$info_tab" | awk '
  NR==1 {print $1, $2, $3, "Balance", "Balance%", "Surplus", "Charged", $15}
  NR!=1 {
    split("6 11 14",a," ")
    for (i in a) {f=a[i]; if ($f+0==$f && $f!=0) {P[f]="("$f")"} else {P[f]=""}}
    print $1, $2, $3, $5P[6], $8, $10P[11], $13P[14], $15
  }' | column -t)
short_tab_help="Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
Balance Cloudwatch CPUCreditBalance last datapoint value
Balance% Last datapoint as percent of maximum possible size
Surplus Cloudwatch CPUSurplusCreditBalance last datapoint value
Charged Cloudwatch CPUSurplusCreditsCharged last datapoint value
Stat Instance alert status
If there's a diff from previous mertic value is shows in parenthesis."
# The balance alerts are raised on the percentage alone (value <= threshold); the trend condition
# (Balance-Diff<0) is disabled in the status calculation, so it is not advertised here either.
short_status_help="Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - Balance%<=${balance_warn}
Balance_Crit [Critical] - Balance%<=${balance_crit}
CpuConf 'unlimited':
Surplus [Warning] - Surplus>0 (base cpu burst is depleted)
CHARGED [Critical] - Charged>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than ${max_age}Sec
Missing [Warning] - less than 2 datapoint found in last $metric_time minutes"
# Short report text: indented short table followed by both help sections.
info_short=$(
  echo "Info table:"
  echo "$short_tab" | sed 's#^# #'
  echo "
$short_tab_help
$short_status_help"
)
### Error vars and info
# StatList[<status>]  - space separated Name column values of the rows carrying that status
# StatCount[<status>] - how many names StatList holds for that status
declare -A StatList StatCount
#stat_list=`echo "$data_tab" | awk 'NR!=1{print $NF}' | sort -u | xargs`
stat_list="Balance_Warn Balance_Crit Surplus CHARGED Stale Missing"
for S in $stat_list; do
  StatList[$S]=$(awk -v S=$S '$NF==S{print $2}' <<<"$data_tab" | xargs)
  StatCount[$S]=$(wc -w <<<"${StatList[$S]}")
done
# Number of data rows (header line excluded).
ins_count=$(sed 1d <<<"$data_tab" | wc -l)
# error_ins  - one "<status>[<count>]: <names>" line per status that occurred
# error_line - "<status>=<count> " pairs for the first output line (each followed by a blank)
error_ins=""
error_line=""
for S in $stat_list; do
  if [ "$S" != "ok" ] && [[ ${StatCount[$S]} -gt 0 ]]; then
    error_ins+="${error_ins:+$'\n'}$S[${StatCount[$S]}]: ${StatList[$S]}"
    error_line+="$S=${StatCount[$S]} "
  fi
done
### Exit code and first line
# 2 when any instance is Balance_Crit or CHARGED, 1 for any other alert status, 0 otherwise.
if [[ ${StatCount[Balance_Crit]} -gt 0 || ${StatCount[CHARGED]} -gt 0 ]]; then
  ec=2
elif [[ ${StatCount[Balance_Warn]} -gt 0 || ${StatCount[Surplus]} -gt 0 ||
        ${StatCount[Stale]} -gt 0 || ${StatCount[Missing]} -gt 0 ]]; then
  ec=1
else
  ec=0
fi
## Info print
# First line: summary with the per status counters (if any).
if [[ -n "$error_line" ]]; then
  printf '%s\n' "$ins_count instances checked, errors found: $error_line"
else
  printf '%s\n' "$ins_count instances checked, no errors found"
fi
# Affected instances per status, indented, followed by an empty line.
if [[ -n "$error_ins" ]]; then
  printf '%s\n' "Errors:"
  sed 's#^# #' <<<"$error_ins"
  printf '\n'
fi
printf '%s\n' "$info_short"
exit $ec
7 instances checked, errors found: Balance_Warn=1 Balance_Crit=1 Surplus=1 CHARGED=1
Errors:
Balance_Warn[1]: Server001
Balance_Crit[1]: Server002
Surplus[1]: Server003
CHARGED[1]: Server005
Info table:
Name Type CpuConf Balance Balance% Surplus Charged Stat
Server001 t2.small standard 288(-4) 45 0 0 Balance_Warn
Server002 t2.medium standard 576 18 0 0 Balance_Crit
Server003 t3.medium unlimited 576 100 99 0 Surplus
Server005 t3a.nano unlimited 144 100 0 55 CHARGED
Server006 t2.medium standard 576 100 0 0 ok
Server007 t2.small standard 287(-0.5) 99 0 0 ok
Server008 t2.medium standard 253(12) 44 0 0 ok
Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
Balance Cloudwatch CPUCreditBalance last datapoint value
Balance% Last datapoint as percent of maximum possible size
Surplus Cloudwatch CPUSurplusCreditBalance last datapoint value
Charged Cloudwatch CPUSurplusCreditsCharged last datapoint value
Stat Instance alert status
If there's a diff from previous mertic value is shows in parenthesis.
Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - Balance-Diff<0 and Balance%<50
Balance_Crit [Critical] - Balance-Diff<0 and Balance%<20
CpuConf 'unlimited':
Surplus [Warning] - Surplus>0 (base cpu burst is depleted)
CHARGED [Critical] - Charged>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than 720Sec
Missing [Warning] - less than 2 datapoint found in last 15 minutes
7 instances checked, errors found: Balance_Warn=1 Balance_Crit=1 Surplus=1 CHARGED=1
Errors:
Balance_Warn[1]: Server001
Balance_Crit[1]: Server002
Surplus[1]: Server003
CHARGED[1]: Server005
Info table:
Name Type CpuConf BalanceAge BalanceData BalanceDiff BalanceMax BalancePerc SurplusAge SurplusData SurplusDiff ChargedAge ChargedData ChargedDiff Stat
Server001 t2.small standard 354 280 -4 288 45 355 0 0 356 0 0 Balance_Warn
Server002 t2.medium standard 354 576 0 576 18 355 0 0 356 0 0 Balance_Crit
Server003 t3.medium unlimited 354 576 0 576 100 355 99 0 356 0 0 Surplus
Server005 t3a.nano unlimited 354 144 0 144 100 355 0 0 356 55 0 CHARGED
Server006 t2.medium standard 354 576 0 576 100 355 0 0 356 0 0 ok
Server007 t2.medium standard 354 575 -0.5 576 99 355 0 0 356 0 0 ok
Server008 t2.medium standard 354 253 12 576 44 355 0 0 356 0 0 ok
Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
BalanceAge Cloudwatch CPUCreditBalance last datapoint age in seconds
BalanceData Cloudwatch CPUCreditBalance last datapoint value
BalanceDiff Cloudwatch CPUCreditBalance last two datapoint difference (last-prev)
BalanceMax Max credit possible
BalancePerc Last datapoint as percent of max
SurplusAge Cloudwatch CPUSurplusCreditBalance last datapoint age in seconds
SurplusData Cloudwatch CPUSurplusCreditBalance last datapoint value
SurplusDiff Cloudwatch CPUSurplusCreditBalance last two datapoint difference (last-prev)
ChargedAge Cloudwatch CPUSurplusCreditsCharged last datapoint age in seconds
ChargedData Cloudwatch CPUSurplusCreditsCharged last datapoint value
ChargedDiff Cloudwatch CPUSurplusCreditsCharged last two datapoint difference (last-prev)
Stat Instance alert status
Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - BalanceDiff<0 and BalancePerc<50
Balance_Crit [Critical] - BalanceDiff<0 and BalancePerc<20
CpuConf 'unlimited':
Surplus [Warning] - SurplusData>0 (base cpu burst is depleted)
CHARGED [Critical] - ChargedData>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than 720Sec
Missing [Warning] - less than 2 datapoint found in last 15 minutes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment