Created
November 13, 2012 17:39
-
-
Save daniyalzade/4067207 to your computer and use it in GitHub Desktop.
Nagios Plugin For Checking Uptime in a Cluster (over multiple hosts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Script to ensure that the cluster defined by the hostgroup, or list of
# hosts, has enough healthy nodes as defined by the 'check_ping' plugin.
# If N or more nodes are unhealthy, the plugin returns
# the appropriate warning or critical status.
# | |
# To get the list of hosts from a hostgroup, this script parses nagios' | |
# hosts.cfg file which has the following format: | |
# | |
# ... | |
# define host { | |
# host_name host01.chartbeat.com | |
# address 192.168.0.1 | |
# use hostgroupname | |
# } | |
# ... | |
# requirements: | |
# envoy | |
# tornado - for option parsing | |
from collections import defaultdict | |
import envoy | |
import logging | |
import re | |
from tornado.options import define, options, parse_command_line | |
# Exit codes this plugin reports back to Nagios.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

# Matchers for the 'host_name' and 'use' directives of a Nagios object
# definition. Each pattern is anchored at the start of the line and requires
# whitespace after the directive name, so that it only fires on the directive
# itself: an unanchored 'use' would also match inside other tokens such as
# 'host_name house01' or 'check_command check_users', and commented-out
# directives ('# host_name ...') would be picked up as live ones.
HOST_PATTERN = re.compile(r'^\s*host_name\s+(\S+)')
USE_PATTERN = re.compile(r'^\s*use\s+(\S+)')
def _check_host(check_ping, host, ping_warning, ping_critical):
    """
    Run the check_ping plugin against a single host.

    @param check_ping: str, path to check_ping bin
    @param host: str, passed to check_ping as its first argument
    @param ping_warning: str, value handed to check_ping's -w option
    @param ping_critical: str, value handed to check_ping's -c option
    @return: (status_code, output) of the command; (CRITICAL, message) when
        the command could not be run at all
    """
    command = "%s %s -w %s -c %s" % (
        check_ping, host, ping_warning, ping_critical)
    logging.info("running command '%s'" % command)
    try:
        outcome = envoy.run(command)
        code, output = outcome.status_code, outcome.std_out
        logging.info("status_code: '%s', std_out: '%s'" % (code, output))
    except Exception:
        # A plugin that cannot even be launched is reported like a failed
        # check rather than aborting the whole cluster check.
        failure = "could not run command '%s'" % command
        logging.warning(failure, exc_info=True)
        return (CRITICAL, failure)
    return (code, output)
def _parse_hosts_file(content):
    """
    Parse the text of a Nagios hosts.cfg file and return a
    hostgroup -> hosts lookup dict.

    A host is filed under the value of the 'use' directive that appears in
    the same 'define ... { ... }' block as its 'host_name' directive.

    @param content: str, full text of the hosts.cfg file
    @return: defaultdict(list) mapping str(hostgroup) -> list(str(host))
    """
    hostgroups = defaultdict(list)
    hostname = None
    use = None
    for raw_line in content.splitlines():
        line = raw_line.strip()
        # Blank lines and commented-out directives carry no information.
        if not line or line[0] in '#;':
            continue
        # The end of a definition: forget anything half-collected.
        if line.startswith('}'):
            hostname = None
            use = None
            continue
        # The start of a definition: a previous block that had only one of
        # the two directives (e.g. a template with 'use' but no 'host_name')
        # must not pair up with a directive from this block.
        if line.startswith('define'):
            hostname = None
            use = None
        # A line holds a single directive, so once it is recognised as
        # 'host_name' it is not also tried as 'use'.
        host_match = HOST_PATTERN.search(line)
        if host_match:
            hostname = host_match.group(1)
        else:
            use_match = USE_PATTERN.search(line)
            if use_match:
                use = use_match.group(1)
        if hostname and use:
            hostgroups[use].append(hostname)
            hostname = None
            use = None
    return hostgroups
def _main_helper(hosts_cfg, warning, critical, ping_warning, ping_critical,
                 check_ping=None,
                 hosts=None,
                 hostgroup=None,
                 ):
    """
    Ping every host of the cluster and turn the number of failing hosts into
    a single status code, printing a one-line summary.

    When 'hosts' is empty the hosts are looked up in 'hosts_cfg' for every
    hostgroup named in the comma separated 'hostgroup' value.

    @param hosts_cfg: path to the hosts cfg file, only read when no hosts
        are given
    @param warning: int, number of failing hosts at which WARNING is returned
    @param critical: int, number of failing hosts at which CRITICAL is
        returned
    @param ping_warning: str, forwarded to _check_host
    @param ping_critical: str, forwarded to _check_host
    @param check_ping: str, path to the check_ping binary
    @param hosts: list of hosts to check; takes precedence over hostgroup
    @param hostgroup: str, one hostgroup or a CSV of hostgroups
    @return: int, OK, WARNING or CRITICAL
    """
    if not hosts:
        # 'with' closes the config file as soon as it has been read.
        with open(hosts_cfg) as cfg_file:
            hostgroups = _parse_hosts_file(cfg_file.read())
        # Distinct loop name: 'hostgroup' must keep the full value the caller
        # passed so the messages below report every requested group.
        group_names = [name.strip() for name in (hostgroup or '').split(',')]
        hosts = []
        for group_name in group_names:
            # .get() rather than indexing: indexing a defaultdict would
            # insert the unknown name and list it as a known hostgroup.
            hosts.extend(hostgroups.get(group_name, []))
        if not hosts:
            msg = ("WARNING - no hostgroup '%s'. Known hostgroups: %s" %
                   (hostgroup, sorted(hostgroups)))
            logging.info(msg)
            # Parenthesised single-argument print also works as a Python 2
            # print statement.
            print(msg)
            return WARNING

    logging.info("running checks on hosts %s, warning %s, critical %s" %
                 (hosts, warning, critical))
    errors = []
    for host in hosts:
        (status_code, std_out) = _check_host(
            check_ping,
            host,
            ping_warning,
            ping_critical,
        )
        # Any non-zero plugin status counts the host as failing.
        if status_code:
            errors.append((host, status_code, std_out))
    err_msg = '|| '.join([str(e) for e in errors])
    num_errors = len(errors)

    # The critical threshold is tested first so it wins over warning.
    if num_errors >= critical:
        msg = ("ERROR - hostgroup '%s' [hosts '%s'] failing. Errors: %s, Num Errors %s/%s" %
               (hostgroup, hosts, err_msg, num_errors, critical))
        logging.info(msg)
        print(msg)
        return CRITICAL
    if num_errors >= warning:
        msg = ("WARNING - hostgroup '%s' [hosts '%s'] failing. Errors: %s, Num Errors %s/%s" %
               (hostgroup, hosts, err_msg, num_errors, warning))
        logging.info(msg)
        print(msg)
        return WARNING
    msg = ("OK - hostgroup '%s' [hosts '%s']. Errors: %s, Num Errors %s" %
           (hostgroup, hosts, err_msg, num_errors))
    logging.info(msg)
    print(msg)
    return OK
def main():
    """
    Declare the command line options, parse them and run the cluster check.

    @return: int, CRITICAL when neither a hostgroup nor hosts were passed,
        otherwise the status returned by _main_helper
    """
    # (option name, keyword arguments for define), registered in this order.
    option_specs = [
        ('hosts_cfg', {
            'help': 'absolute path to hosts cfg file',
            'default': '/etc/nagios3/conf.d/servers/hosts.cfg',
        }),
        ('check_ping', {
            'help': 'absolute path to check_ping binary',
            'default': '/usr/lib/nagios/plugins/check_ping',
        }),
        ('warning', {
            'help': 'threshold for number of hosts to be down before warning',
            'default': 1,
            'type': int,
        }),
        ('critical', {
            'help': 'threshold for number of hosts to be down before critical',
            'default': 2,
            'type': int,
        }),
        ('ping_warning', {
            'default': '100.0,20%',
        }),
        ('ping_critical', {
            'default': '500.0,60%',
        }),
        ('hosts', {
            'multiple': True,
            'help': 'List of hosts. Mainly to be used for debugging',
        }),
        ('hostgroup', {
            'help': 'hostgroup to check for. If multiple hostgroups are to be added, make it a CSV',
        }),
    ]
    for option_name, option_kwargs in option_specs:
        define(option_name, **option_kwargs)

    parse_command_line()

    if options.hostgroup or options.hosts:
        return _main_helper(
            options.hosts_cfg,
            options.warning,
            options.critical,
            options.ping_warning,
            options.ping_critical,
            check_ping=options.check_ping,
            hosts=options.hosts,
            hostgroup=options.hostgroup,
        )

    # Nothing to check: neither a hostgroup nor an explicit host list.
    msg = 'ERROR - either hostgroup or hosts should be passed'
    logging.info(msg)
    print(msg)
    return CRITICAL
if __name__ == '__main__':
    # sys.exit is used instead of the bare exit() helper: exit() is injected
    # by the site module for interactive sessions and is not guaranteed to
    # exist (e.g. under 'python -S'). Imported here so this block does not
    # depend on the file's import section.
    import sys
    try:
        # The SystemExit raised here is not an Exception subclass, so it is
        # not swallowed by the handler below.
        sys.exit(main())
    except Exception:
        # Any unexpected failure is reported to Nagios as UNKNOWN.
        msg = 'interrupted/failed'
        logging.exception(msg)
        print(msg)
        sys.exit(UNKNOWN)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment