Created
August 10, 2017 20:02
-
-
Save kylebrandt/f677a4047032cb458e240fc9358fbab9 to your computer and use it in GitHub Desktop.
Bosun Host Down
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
alert host.down {
    template = host.down
    macro = host.based.contacts
    macro = host.generic_info
    $notes = Both ny-bosun01 and co-tsdb01 have to agree a host is down
    $unit_string = Bosun ping timeout

    # Ping-timeout series reported by the two pinging hosts for every destination host.
    $query_string = "sum:bosun.ping.timeout{dst_host=*,host=ny-bosun01|co-tsdb01}"

    # In each expression below, rename() re-tags the pinging host as "source" and the
    # ping target as "host"; t(..., "host") then groups the per-source results by
    # target host so that sum() combines the values from all sources.

    # Window: the last two minutes (min per source).
    $last_two = sum(t(min(rename(q($query_string, "2m", ""), "host=source,dst_host=host")), "host"))
    # Window: from five minutes ago to one minute ago (min per source).
    $previous_four = sum(t(min(rename(q($query_string, "5m", "1m"), "host=source,dst_host=host")), "host"))
    # Window: from seven minutes ago to five minutes ago (max per source).
    $five_to_seven = sum(t(max(rename(q($query_string, "7m", "5m"), "host=source,dst_host=host")), "host"))

    # The right side of || is logic to deal with recovering. The problem is that hosts will
    # come up and be pingable before other services are available. Without it we would get
    # alerts that are dependent upon this alert firing just because of reboots (false positives).
    # The min of the ping timeout is true (1) only if the pings have been timing out for the
    # entire duration of the window.
    # Trigger down for two minutes, recover after being up for four minutes. This is a bit odd
    # because the "recovery" logic needs to be inverted to say "still down", and $five_to_seven
    # lets us know that the host "was" down and is now up.
    # NOTE(review): $active_dc_count is not defined in this alert; it is presumably a global
    # variable holding the number of pinging sources that must agree - confirm it is defined.
    crit = $last_two >= $active_dc_count || ($five_to_seven >= $active_dc_count && $previous_four)

    # Need unjoined okay, since new things might not have the previous data yet, and then the
    # join will fail.
    unjoinedOk = true
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment