Created
June 1, 2019 04:56
-
-
Save arrdem/27aa6eadefb6d0538c189755546c5ccc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Evil monitoring. | |
Ping hosts, syslogging at INFO if they're up and happy; otherwise use Telnet scripting to
force-reset them, syslogging at ALERT with what the uptime was prior to the forced reboot.
Hosts are debounced, so that they have a chance to return before monitoring resumes. | |
No effort is made to detect network conditions or poweroffs. | |
""" | |
from os import kill, getpid | |
import multiprocessing | |
import subprocess | |
import syslog | |
from datetime import datetime, timedelta | |
from time import sleep | |
import signal | |
from sys import exit | |
from telnetlib import Telnet | |
# Settings read by the monitor processes and by the PDU reboot helper.
CONFIG = {
    # Login used on the APC PDU's Telnet console.
    # NOTE(review): credentials live in source; consider loading them from
    # the environment or a file outside version control.
    "pdu_username": "REDACTED",
    "pdu_password": "REDACTED",

    # Seconds to leave a host alone after a monitoring round (re)starts.
    # Hosts take about 40s to recover, although they only stop answering
    # pings for roughly 6-8s of that.
    "debounce": 40,

    # Unhealthy score at which a host gets power-cycled; once a host is up,
    # about 5s without a ping reply is taken as indicative of a hang.
    "threshold": 5,

    # hostname -> PDU port the host is plugged into.
    "hosts": {
        "logos": "2",
        "ethos": "3",
        "pathos": "4",
        # "ketos": "5",
    },
}
def log(level, msg):
    """Report msg both on stdout and through syslog.

    The stdout line has the form "<pid> @ <level>] <msg>", where <pid> is the
    calling process's PID. level is passed to syslog.syslog() as the message
    priority (e.g. syslog.LOG_INFO).
    """
    console_line = "{} @ {}] {}".format(getpid(), level, msg)
    print(console_line)
    syslog.syslog(level, msg)
def zdec(i: int):
    """Return i - 1, clamped so the result never drops below 0."""
    return max(i - 1, 0)
def ping(hostname: str,
         count: int = 2,
         timeout: int = 1):
    """Run the system `ping` against hostname and report whether it succeeded.

    Invokes `ping -q -c <count> -W <timeout> <hostname>` with its output
    discarded. Returns True when the command exits with status 0 and False
    for any other exit status. Failure to launch the command at all (for
    example, no `ping` executable) is not handled here and propagates.
    """
    command = ["ping", "-q", "-c", str(count), "-W", str(timeout), hostname]
    exit_status = subprocess.call(command,
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL)
    return exit_status == 0
# Serializes PDU sessions so that only one process is logged in at a time.
# NOTE(review): the lock is created at import time, so it is only shared
# between monitor processes when they are forked from the parent; under the
# "spawn" start method each child would build its own lock - confirm.
__reboot_lock__ = multiprocessing.Lock()


def do_reboot(port: str):
    """Get a shared lock, telnet to sucker, reset the port and log out.

    Logs into the PDU at host "sucker" (TCP port 23) with the credentials in
    CONFIG, sends "reboot <port>" followed by "quit", and closes the
    connection.

    port is the PDU port to reset; it is appended to the command verbatim.

    Errors raised by telnetlib while connecting, reading or writing propagate
    to the caller. The connection is closed and the lock released whether or
    not the exchange succeeds.
    """

    def _line(text):
        # Everything typed at the console is sent CR-terminated, as bytes.
        return (text + "\r").encode("utf-8")

    def apc_login(conn):
        conn.read_until(b"User Name")
        conn.write(_line(CONFIG['pdu_username']))
        conn.read_until(b"Password")
        conn.write(_line(CONFIG['pdu_password']))

    def apc_command(conn, cmd):
        # Wait for the prompt before sending the next command.
        conn.read_until(b"APC>")
        conn.write(_line(cmd))

    # To ensure only one process logs into the PDU at once
    with __reboot_lock__:
        conn = Telnet('sucker', 23)
        try:
            apc_login(conn)
            apc_command(conn, "reboot " + port)
            apc_command(conn, "quit")
        finally:
            # Always release the socket, even if the PDU dropped the session
            # or a read/write failed part-way through.
            # NOTE(review): read_until() is called without a timeout, so an
            # unexpected prompt blocks here while holding the lock.
            conn.close()
def monitor(hostname: str, port: str):
    """Watch one host forever, power-cycling it through the PDU when it hangs.

    hostname is the host to ping; port is the PDU port handed to do_reboot()
    when the host is judged unhealthy.

    Each monitoring round starts with a debounce period (CONFIG["debounce"]
    seconds) during which nothing is checked. After that the host is pinged
    about once per loop iteration: a failed ping raises an "unhealthy" score,
    a successful one lowers it (never below 0). When the score reaches
    CONFIG["threshold"] the host is rebooted and a new round begins. While
    the host is healthy a heartbeat is logged roughly every five minutes.

    Installs a SIGINT handler that prints a message and exits the process
    with status 0. Does not return otherwise.
    """

    # Set a signal handler for shutdown
    def _sigint(_signum, _frame):
        print("monitor for {hostname} shutting down...".format(hostname=hostname))
        exit(0)

    signal.signal(signal.SIGINT, _sigint)

    # Do the work
    log(syslog.LOG_INFO, "Monitoring {hostname}".format(hostname=hostname))

    threshold = CONFIG["threshold"]
    debounce = timedelta(seconds=CONFIG["debounce"])
    heartbeat = timedelta(minutes=5)

    # Outer loop - never exits, just restores state
    while True:
        start = datetime.today()
        counter = 0
        # Elapsed time (since start) at which the next heartbeat is due.
        # Tracking it explicitly means a heartbeat is never skipped just
        # because no iteration happened to land on an exact 5-minute mark.
        next_heartbeat = heartbeat

        # Inner loop - a single monitoring round terminated by a restart
        while True:
            delta = datetime.today() - start

            if delta < debounce:
                # Still giving the host time to come back; check nothing.
                pass

            elif counter >= threshold:
                # Bounce the box, wait for it to become healthy again.
                # counter is a number of failed checks, used here as an
                # approximate number of seconds.
                uptime = delta.total_seconds() - counter
                log(syslog.LOG_ALERT,
                    "{hostname} detected unhealthy for {counter}s after {uptime}s up, forcing reboot!".format(
                        hostname=hostname, counter=counter, uptime=uptime))
                try:
                    do_reboot(port)
                except (OSError, EOFError) as error:
                    # A PDU that is unreachable or drops the session must not
                    # kill this monitor; log it and let the next round retry.
                    log(syslog.LOG_ERR,
                        "{hostname} reboot via PDU port {port} failed: {error!r}".format(
                            hostname=hostname, port=port, error=error))
                # Break into the outer loop, resetting state
                break

            elif not ping(hostname):
                # If the hostname is unhealthy, we increment its "bad" score
                log(syslog.LOG_WARNING,
                    "{hostname} detected unhealthy ({counter} of {threshold})".format(
                        hostname=hostname, counter=counter, threshold=threshold))
                counter += 1

            else:
                # Otherwise we zdec the score.
                counter = zdec(counter)

                # delta >= debounce implied by if ordering
                if delta >= next_heartbeat:
                    log(syslog.LOG_INFO,
                        "{} healthy for {}s".format(hostname, delta.total_seconds()))
                    # Schedule the next heartbeat on the following multiple
                    # of the heartbeat interval.
                    next_heartbeat = (delta // heartbeat + 1) * heartbeat

            sleep(1)
if __name__ == "__main__":
    # One monitor process per configured host; filled in as they are started.
    processes = []

    def stop_processes(_signum: int, _frame):
        """On SIGINT, forward the signal to every monitor and wait for them."""
        # Signal all children first so they shut down concurrently...
        for child in processes:
            kill(child.pid, signal.SIGINT)
        # ...then wait for each of them to finish.
        for child in processes:
            child.join()

    signal.signal(signal.SIGINT, stop_processes)

    for hostname, port in CONFIG["hosts"].items():
        worker = multiprocessing.Process(target=monitor, args=(hostname, port))
        # Record the process before starting it so the handler can see it.
        processes.append(worker)
        worker.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment