Last active
October 13, 2019 17:39
-
-
Save shokoe/7fd89ad4a829133c3d1684dedee975ca to your computer and use it in GitHub Desktop.
A Nagios/Naemon script for monitoring CPU burst status for all T2 and T3 EC2 instances in an account
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Nagios/Naemon check: reports the CPU credit (burst) status of the running
# T2/T3/T3a EC2 instances and exits with a Nagios status code.

# Helper library; presumably provides the Eins / Etul_* commands used below
# (they are not defined in this file) - TODO confirm.
. /opt/EC2ulz/EC2ulz.sh

# Datapoints older than this many seconds are reported as "Stale".
#max_age=660
max_age=720
# CPUCreditBalance thresholds, in percent of the per-type maximum balance.
balance_warn=50
balance_crit=20
# history of metrics to get in minutes
metric_time=15
# Couldn't find an API for this data so it's hard coded :(
# This is the max CPUCreditBalance available per instance type.
# Two-column table with a header row: instance type, maximum credit balance.
# It is joined onto the instance table by the Type column further down.
type_max_balance='Type CPUCreditBalanceMax
t2.nano 72
t2.micro 144
t2.small 288
t2.medium 576
t2.large 864
t2.xlarge 1296
t2.2xlarge 1958.4
t3.nano 144
t3.micro 288
t3.small 576
t3.medium 576
t3.large 864
t3.xlarge 2304
t3.2xlarge 4608
t3a.nano 144
t3a.micro 288
t3a.small 576
t3a.medium 576
t3a.large 864
t3a.xlarge 2304
t3a.2xlarge 4608'
# Table (with header row) of running instances whose type matches t2/t3/t3a:
# columns ID, Name, Type, State.
ins_tab=$(Eins ID,Name,Type,State | Etul_grep running | Etul_grep 't[23]a?\.' | Etul_colrst)
# Space-separated instance ids (header row skipped).
id_list=$(echo "$ins_tab" | awk 'NR!=1{print $1}' | xargs)
# With no ids the AWS calls below can only fail on a missing/empty argument,
# so report the empty result right away (same first line and exit code the
# script ends up with for an empty table).
if [ -z "$id_list" ]; then
  echo "0 instances checked, no errors found"
  exit 0
fi
# Per-instance credit specification ("standard" / "unlimited") as a table:
# columns ID, CpuConf. $id_list is deliberately unquoted: one argument per id.
cpu_tab=$( (
  echo "ID CpuConf"
  aws ec2 describe-instance-credit-specifications --instance-ids $id_list --query 'InstanceCreditSpecifications[*].[InstanceId,CpuCredits]' --output text
) |
  sed 's#\t# #g' | column -t)
# auto_join - joins two table vars (each with a header row); rows of the 1st table that have no match in the 2nd get NA as default values. Both tables are sorted automatically on their join columns.
# input - <1:1st table var> <2:2nd table var> <3:1st table join col> <4:2nd table join col>
# count_fields - print the number of whitespace-separated columns in the
# first (header) line of a table.
# input - <1:table var>
count_fields() { printf '%s\n' "$1" | head -n 1 | wc -w; }
# auto_join - left-join two whitespace-separated tables that each start with a
# header row, and print the result aligned with `column -t`.
# Arguments:
#   $1 - left table (every one of its rows is kept)
#   $2 - right table
#   $3 - join column number in the left table (default 1)
#   $4 - join column number in the right table (default 1)
# Output: all left columns followed by the right columns except its join
# column; left rows without a match get NA in the right-hand columns.
auto_join() {
  local left=$1 right=$2 lcol=${3:-1} rcol=${4:-1}
  local nleft nright fields="" i
  nleft=$(count_fields "$left")
  nright=$(count_fields "$right")
  # Build the join -o list without eval: 1.1..1.N, then every right column
  # but the join column itself (it is already present on the left side).
  for ((i = 1; i <= nleft; i++)); do
    fields+="${fields:+,}1.$i"
  done
  for ((i = 1; i <= nright; i++)); do
    ((i == rcol)) && continue
    fields+="${fields:+,}2.$i"
  done
  (
    # Header rows are joined on their own so they stay on top.
    join -a 1 -e NA -1 "$lcol" -2 "$rcol" -o "$fields" \
      <(printf '%s\n' "$left" | head -n 1) \
      <(printf '%s\n' "$right" | head -n 1)
    # join needs both inputs sorted on the join field only; the "b" modifier
    # ignores the padding blanks that column-aligned input puts before a field.
    join -a 1 -e NA -1 "$lcol" -2 "$rcol" -o "$fields" \
      <(printf '%s\n' "$left" | sed 1d | sort -k "${lcol}b,${lcol}") \
      <(printf '%s\n' "$right" | sed 1d | sort -k "${rcol}b,${rcol}")
  ) | column -t
}
# Instance table extended with the CpuConf column from the credit specification table.
merge_tab=$(auto_join "$ins_tab" "$cpu_tab")
# Template for a single get-metric-data query object. The placeholders
# @ID@, @METRIC@ and @INS_ID@ are substituted once per instance and metric.
json_skel='{
"Id": "@ID@",
"MetricStat": {
"Metric": {
"Namespace": "AWS/EC2",
"MetricName": "@METRIC@",
"Dimensions": [
{ "Name": "InstanceId", "Value": "@INS_ID@" }
]
},
"Period": 300,
"Stat": "Average",
"Unit": "Count"
},
"ReturnData": true
}'
# float_precision - shorten decimal numbers in a table read from stdin and
# re-align it with `column -t`. Only values that have a space on both sides
# are touched (a space is appended to each line so the last column qualifies;
# the first column never does):
#   two or more integer digits -> decimals dropped          (12.75   -> 12)
#   one non-zero integer digit -> one decimal kept          (3.14159 -> 3.1)
#   0.xxx                      -> cut after the first digit that follows the
#                                 leading zeros             (0.0333  -> 0.03)
# Every rule applies to all matching columns of a row.
float_precision() {
  sed -E '
    s#$# #
    s# (-?[1-9][0-9]+)\.[^ ]+ # \1 #g
    s# (-?[1-9]\.[0-9])[^ ]* # \1 #g
    s# (-?0\.0*[0-9])[^ ]* # \1 #g
  ' | column -t
}
# _add_balance_perc - stdin: merged table with a header row, balance value in
# column 7 and per-type maximum in column 9. Appends a CPUCreditBalancePerc
# column (value / max * 100). The result is NA unless both inputs are numeric
# and the maximum is positive, so a missing datapoint is never reported as 0%.
_add_balance_perc() {
  local header
  read -r header
  echo "$header CPUCreditBalancePerc"
  awk '{
    if ($9+0==$9 && $7+0==$7 && $9>0) P=$7/$9*100; else P="NA"
    print $0, P
  }'
}

# Per metric name: the query JSON, the raw CLI answer and the parsed table.
declare -A Query Answer Table
#for m in CPUCreditBalance; do
for m in CPUCreditBalance CPUSurplusCreditBalance CPUSurplusCreditsCharged; do
  # JSON array holding one query object per instance. The query Id is the
  # instance id without its dash; the comma emitted after the last object is
  # removed by sed '$d'.
  #Query[$m]="["$(for i in i-09b2700c8258c60fc; do
  Query[$m]="[$(for i in $id_list; do
    echo "$json_skel" | sed "s#@ID@#${i/-/}#; s#@METRIC@#$m#; s#@INS_ID@#$i#;"
    echo ","
  done | sed '$d')]"
  Answer[$m]=$(aws cloudwatch get-metric-data \
    --start-time "$(date -Isec -d "-${metric_time}min")" \
    --end-time "$(date -Isec)" \
    --metric-data-queries "${Query[$m]}" \
    --scan-by TimestampDescending)
  # One row per result: label, newest timestamp, newest value and the
  # difference to the previous value (NA with fewer than two datapoints).
  # NOTE(review): .Label is used as the instance id for the join below -
  # confirm that CloudWatch returns the instance id as the default label.
  # Etul_ager presumably converts the timestamp column into an age in seconds
  # - TODO confirm against EC2ulz.sh.
  Table[$m]=$( (
    echo "ID ${m}Age ${m}Data ${m}Diff"
    echo "${Answer[$m]}" | jq -r '.MetricDataResults[] | "\(.Label) \(.Timestamps[0]) \(.Values[0]) \(if (.Values | length) >= 2 then .Values[0]-.Values[1] else "NA" end)"'
  ) |
    sed 's#\bnull\b#NA#g' | column -t | Etul_ager ${m}Age %0S)
  # Append the metric columns; keep the header first and sort the data rows
  # from the Name column on.
  merge_tab=$(auto_join "$merge_tab" "${Table[$m]}" | (
    read h
    echo "$h"
    sort -k 2
  ) | column -t)
  if [ "$m" = "CPUCreditBalance" ]; then
    # Add the per-type maximum (joined on Type, column 3) and the percentage.
    merge_tab=$(auto_join "$merge_tab" "$type_max_balance" 3 1 |
      _add_balance_perc | column -t)
  fi
done
# _add_stat_col - stdin: merged table with a header row; appends a Stat column.
# Arguments: $1 - max datapoint age in seconds, $2 - balance warning percent,
#            $3 - balance critical percent
# Columns read: 5 CpuConf, 6/11/14 datapoint ages, 10 balance percent,
# 12 surplus balance, 15 surplus credits charged.
# Later assignments override earlier ones: Stale, then Missing, then the
# balance / surplus states. Threshold checks only run on numeric values, so
# an NA field cannot be string-compared into a false alert.
_add_stat_col() {
  local header
  read -r header
  echo "$header Stat"
  awk -v M="$1" -v BW="$2" -v BC="$3" '{
    Stat="ok"
    # Stale metrics
    if ($6>M || $11>M || $14>M) Stat="Stale"
    # Missing any data
    for(i=1; i<=NF; i++) { if ($i=="NA") Stat="Missing"; }
    if ($5=="standard") {
      # Balance alerts for limited, only if last value smaller than the previous one
      #if ($10 <= BW && $8 <= 0) Stat="Balance_Warn"
      #if ($10 <= BC && $8 <= 0) Stat="Balance_Crit"
      # without trends
      if ($10+0==$10 && $10 <= BW) Stat="Balance_Warn"
      if ($10+0==$10 && $10 <= BC) Stat="Balance_Crit"
    } else if ($5=="unlimited") {
      # Surplus
      if ($12+0==$12 && $12 > 0) Stat="Surplus"
      # Charged
      if ($15+0==$15 && $15 > 0) Stat="CHARGED"
    }
    print $0, Stat
  }'
}

# Full data table: merged table with shortened numbers plus the Stat column.
data_tab=$(echo "$merge_tab" | float_precision |
  _add_stat_col "$max_age" "$balance_warn" "$balance_crit" | column -t)
### Info with big table
# Shorten the metric name prefixes in the header row, then blank out the
# ID ($1) and State ($4) columns and re-align.
info_tab=$(echo "$data_tab" | sed '1{s#CPUCreditBalance#Balance#g; s#CPUSurplusCreditBalance#Surplus#g; s#CPUSurplusCreditsCharged#Charged#g; s#(age)##g;}' | awk '{$1=$4=""; print $0;}' | column -t)
info_tab_help="Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
BalanceAge Cloudwatch CPUCreditBalance last datapoint age in seconds
BalanceData Cloudwatch CPUCreditBalance last datapoint value
BalanceDiff Cloudwatch CPUCreditBalance last two datapoint difference (last-prev)
BalanceMax Max credit possible
BalancePerc Last datapoint as percent of max
SurplusAge Cloudwatch CPUSurplusCreditBalance last datapoint age in seconds
SurplusData Cloudwatch CPUSurplusCreditBalance last datapoint value
SurplusDiff Cloudwatch CPUSurplusCreditBalance last two datapoint difference (last-prev)
ChargedAge Cloudwatch CPUSurplusCreditsCharged last datapoint age in seconds
ChargedData Cloudwatch CPUSurplusCreditsCharged last datapoint value
ChargedDiff Cloudwatch CPUSurplusCreditsCharged last two datapoint difference (last-prev)
Stat Instance alert status"
# The balance alerts are evaluated without the trend (BalanceDiff) condition,
# on "percent <= threshold"; the text states exactly that rule.
status_help="Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - BalancePerc<=${balance_warn}
Balance_Crit [Critical] - BalancePerc<=${balance_crit}
CpuConf 'unlimited':
Surplus [Warning] - SurplusData>0 (base cpu burst is depleted)
CHARGED [Critical] - ChargedData>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than ${max_age}Sec
Missing [Warning] - less than 2 datapoint found in last $metric_time minutes"
# Long report: the table followed by both help texts.
infoi_full=$(
  echo "Info table:"
  echo "$info_tab" | sed 's#^# #'
  echo "
$info_tab_help
$status_help"
)
### Info with small table
# Condensed view of info_tab: each value column is followed by its diff in
# parentheses when that diff is numeric and non-zero.
# info_tab columns used: 5/6 balance value/diff, 8 balance percent,
# 10/11 surplus value/diff, 13/14 charged value/diff, 15 status.
short_tab=$(echo "$info_tab" | awk 'NR==1{print $1, $2, $3, "Balance", "Balance%", "Surplus", "Charged", $15}
NR!=1{split("6 11 14",a," "); for (i in a) {f=a[i]; if ($f+0==$f && $f!=0) {P[f]="("$f")"} else {P[f]=""}}; print $1, $2, $3, $5P[6], $8, $10P[11], $13P[14], $15}' |
  column -t)
short_tab_help="Info table fields:
Name Instance name tag
Type Instance type
CpuConf Instance unlimited attribute
Balance Cloudwatch CPUCreditBalance last datapoint value
Balance% Last datapoint as percent of maximum possible size
Surplus Cloudwatch CPUSurplusCreditBalance last datapoint value
Charged Cloudwatch CPUSurplusCreditsCharged last datapoint value
Stat Instance alert status
If there's a diff from previous mertic value is shows in parenthesis."
# The balance alerts are evaluated without the trend (diff) condition, on
# "percent <= threshold"; the text states exactly that rule.
short_status_help="Alert status:
CpuConf 'standard':
Balance_Warn [Warning] - Balance%<=${balance_warn}
Balance_Crit [Critical] - Balance%<=${balance_crit}
CpuConf 'unlimited':
Surplus [Warning] - Surplus>0 (base cpu burst is depleted)
CHARGED [Critical] - Charged>0 (instance is accumulating additional charges)
Stale [Warning] - Last datapiont found is older than ${max_age}Sec
Missing [Warning] - less than 2 datapoint found in last $metric_time minutes"
# Short report: the table followed by both help texts.
info_short=$(
  echo "Info table:"
  echo "$short_tab" | sed 's#^# #'
  echo "
$short_tab_help
$short_status_help"
)
### Error vars and info
# StatList[status]  - space-separated instance names (column 2) in that status
# StatCount[status] - number of those names
declare -A StatList StatCount
#stat_list=`echo "$data_tab" | awk 'NR!=1{print $NF}' | sort -u | xargs`
stat_list="Balance_Warn Balance_Crit Surplus CHARGED Stale Missing"

# _summarize_stats - fill StatList, StatCount, ins_count, error_ins and
# error_line (all globals) from $data_tab, whose last column is the status
# and whose first row is a header.
_summarize_stats() {
  local S
  for S in $stat_list; do
    StatList[$S]=$(echo "$data_tab" | awk -v S="$S" '$NF==S{print $2}' | xargs)
    StatCount[$S]=$(echo "${StatList[$S]}" | wc -w)
  done
  # Number of data rows (header excluded).
  ins_count=$(echo "$data_tab" | sed 1d | wc -l)
  # One line per non-empty status: "<status>[<count>]: <names>".
  error_ins=$(for S in $stat_list; do
    [ "$S" = "ok" ] && continue
    if [ "${StatCount[$S]}" -gt 0 ]; then
      echo "${S}[${StatCount[$S]}]: ${StatList[$S]}"
    fi
  done)
  # Single-line summary: "<status>=<count> " for each non-empty status.
  error_line=$(for S in $stat_list; do
    [ "$S" = "ok" ] && continue
    if [ "${StatCount[$S]}" -gt 0 ]; then
      printf '%s=%s ' "$S" "${StatCount[$S]}"
    fi
  done)
}
_summarize_stats
### Exit code and first line
# Exit status: 2 when any instance is Balance_Crit or CHARGED, 1 when any is
# Balance_Warn, Surplus, Stale or Missing, otherwise 0. A count that was
# never set is treated as 0.
if [ "${StatCount[Balance_Crit]:-0}" -gt 0 ] || [ "${StatCount[CHARGED]:-0}" -gt 0 ]; then
  ec=2
elif [ "${StatCount[Balance_Warn]:-0}" -gt 0 ] || [ "${StatCount[Surplus]:-0}" -gt 0 ] || [ "${StatCount[Stale]:-0}" -gt 0 ] || [ "${StatCount[Missing]:-0}" -gt 0 ]; then
  ec=1
else
  ec=0
fi
## Info print
# First line: one-line summary.
if [ -z "$error_line" ]; then
  echo "$ins_count instances checked, no errors found"
else
  echo "$ins_count instances checked, errors found: $error_line"
fi
# Affected instances per status, then the short table with its help text.
if [ -n "$error_ins" ]; then
  echo "Errors:"
  echo "$error_ins" | sed 's#^# #'
  echo
fi
echo "$info_short"
exit $ec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
7 instances checked, errors found: Balance_Warn=1 Balance_Crit=1 Surplus=1 CHARGED=1 | |
Errors: | |
Balance_Warn[1]: Server001 | |
Balance_Crit[1]: Server002 | |
Surplus[1]: Server003 | |
CHARGED[1]: Server005 | |
Info table: | |
Name Type CpuConf Balance Balance% Surplus Charged Stat | |
Server001 t2.small standard 288(-4) 45 0 0 Balance_Warn | |
Server002 t2.medium standard 576 18 0 0 Balance_Crit | |
Server003 t3.medium unlimited 576 100 99 0 Surplus | |
Server005 t3a.nano unlimited 144 100 0 55 CHARGED | |
Server006 t2.medium standard 576 100 0 0 ok | |
Server007 t2.small standard 287(-0.5) 99 0 0 ok | |
Server008 t2.medium standard 253(12) 44 0 0 ok | |
Info table fields: | |
Name Instance name tag | |
Type Instance type | |
CpuConf Instance unlimited attribute | |
Balance Cloudwatch CPUCreditBalance last datapoint value | |
Balance% Last datapoint as percent of maximum possible size | |
Surplus Cloudwatch CPUSurplusCreditBalance last datapoint value | |
Charged Cloudwatch CPUSurplusCreditsCharged last datapoint value | |
Stat Instance alert status | |
If there's a diff from previous mertic value is shows in parenthesis. | |
Alert status: | |
CpuConf 'standard': | |
Balance_Warn [Warning] - Balance-Diff<0 and Balance%<50 | |
Balance_Crit [Critical] - Balance-Diff<0 and Balance%<20 | |
CpuConf 'unlimited': | |
Surplus [Warning] - Surplus>0 (base cpu burst is depleted) | |
CHARGED [Critical] - Charged>0 (instance is accumulating additional charges) | |
Stale [Warning] - Last datapiont found is older than 720Sec | |
Missing [Warning] - less than 2 datapoint found in last 15 minutes |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
7 instances checked, errors found: Balance_Warn=1 Balance_Crit=1 Surplus=1 CHARGED=1 | |
Errors: | |
Balance_Warn[1]: Server001 | |
Balance_Crit[1]: Server002 | |
Surplus[1]: Server003 | |
CHARGED[1]: Server005 | |
Info table: | |
Name Type CpuConf BalanceAge BalanceData BalanceDiff BalanceMax BalancePerc SurplusAge SurplusData SurplusDiff ChargedAge ChargedData ChargedDiff Stat | |
Server001 t2.small standard 354 280 -4 288 45 355 0 0 356 0 0 Balance_Warn | |
Server002 t2.medium standard 354 576 0 576 18 355 0 0 356 0 0 Balance_Crit | |
Server003 t3.medium unlimited 354 576 0 576 100 355 99 0 356 0 0 Surplus | |
Server005 t3a.nano unlimited 354 144 0 144 100 355 0 0 356 55 0 CHARGED | |
Server006 t2.medium standard 354 576 0 576 100 355 0 0 356 0 0 ok | |
Server007 t2.medium standard 354 575 -0.5 576 99 355 0 0 356 0 0 ok | |
Server008 t2.medium standard 354 253 12 576 44 355 0 0 356 0 0 ok | |
Info table fields: | |
Name Instance name tag | |
Type Instance type | |
CpuConf Instance unlimited attribute | |
BalanceAge Cloudwatch CPUCreditBalance last datapoint age in seconds | |
BalanceData Cloudwatch CPUCreditBalance last datapoint value | |
BalanceDiff Cloudwatch CPUCreditBalance last two datapoint difference (last-prev) | |
BalanceMax Max credit possible | |
BalancePerc Last datapoint as percent of max | |
SurplusAge Cloudwatch CPUSurplusCreditBalance last datapoint age in seconds | |
SurplusData Cloudwatch CPUSurplusCreditBalance last datapoint value | |
SurplusDiff Cloudwatch CPUSurplusCreditBalance last two datapoint difference (last-prev) | |
ChargedAge Cloudwatch CPUSurplusCreditsCharged last datapoint age in seconds | |
ChargedData Cloudwatch CPUSurplusCreditsCharged last datapoint value | |
ChargedDiff Cloudwatch CPUSurplusCreditsCharged last two datapoint difference (last-prev) | |
Stat Instance alert status | |
Alert status: | |
CpuConf 'standard': | |
Balance_Warn [Warning] - BalanceDiff<0 and BalancePerc<50 | |
Balance_Crit [Critical] - BalanceDiff<0 and BalancePerc<20 | |
CpuConf 'unlimited': | |
Surplus [Warning] - SurplusData>0 (base cpu burst is depleted) | |
CHARGED [Critical] - ChargedData>0 (instance is accumulating additional charges) | |
Stale [Warning] - Last datapiont found is older than 720Sec | |
Missing [Warning] - less than 2 datapoint found in last 15 minutes |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment