Created
January 14, 2014 06:43
-
-
Save chhibber/8414154 to your computer and use it in GitHub Desktop.
AWS Example - NAT Monitor
Pulled together code from a few different locations to make this.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# vim:set sr et ts=4 sw=4 ft=python fenc=utf-8: // See Vim, :help 'modeline' | |
""" | |
""" | |
import optparse
import os
import random
import subprocess
import sys
import time
import traceback
from contextlib import contextmanager
from threading import Thread

import boto
from boto.vpc import VPCConnection
class pingy(Thread):
    """Thread that pings one host and records whether any reply came back.

    After the thread has been started and joined, ``getStatus()`` returns
    one of the labels in ``self.information``:

    * ``"yet to run"``  - ``run()`` has not been entered yet
    * ``"no response"`` - ping was attempted but no reply line was seen
    * ``"Active"``      - at least one reply line was seen
    """

    def __init__(self, ip):
        """Remember the address to ping; *ip* is passed to ``ping`` verbatim."""
        Thread.__init__(self)
        # Human-readable labels, indexed by self.status.
        self.information = ("yet to run", "no response", "Active")
        self.ip = ip
        self.status = 0  # index into self.information -- 0 = "yet to run"

    def _is_reply(self, line):
        """Return True when *line* of ping output is a reply from our host.

        A reply line mentions the target address and carries a ``time=``
        round-trip figure.  Membership tests are used instead of
        ``str.find`` because ``find`` returns -1 (truthy) on a miss and
        0 (falsy) on a match at the start of the line.
        """
        return self.ip in line and 'time=' in line

    def run(self):
        """Send four pings (1 second wait each) and update ``self.status``."""
        self.status = 1  # attempted, but no response seen yet
        try:
            # Argument list, not a shell string: the address is never
            # interpreted by a shell.
            proc = subprocess.Popen(
                ["ping", "-c", "4", "-W", "1", self.ip],
                stdout=subprocess.PIPE,
                universal_newlines=True)
        except OSError:
            # ping could not be launched; leave the host as "no response".
            return
        try:
            for line in proc.stdout:
                if self._is_reply(line):
                    self.status = 2  # 2 = "Active"
        finally:
            # Release the pipe and reap the child so neither leaks.
            proc.stdout.close()
            proc.wait()

    def getStatus(self):
        """Return the human-readable label for the current status."""
        return self.information[self.status]
##################################################################### | |
# Class: | |
##################################################################### | |
class NAT_Monitoring_Info:
    """Filtered lookups against the VPC and EC2 connections for one environment.

    Holds the two connection objects it is given plus the environment
    name, and uses the environment as the ``tag:Env`` filter value in
    both queries.
    """

    def __init__(self, vpcconn, ec2conn, environment):
        """Store the VPC connection, the EC2 connection and the Env tag value."""
        self.ec2vpc, self.ec2, self.env = vpcconn, ec2conn, environment

    def get_broken_route_tables(self):
        """Return whatever the VPC connection reports for blackholed private routes.

        Filters on route state ``blackhole``, ``tag:Network`` = ``Private``
        and ``tag:Env`` = this object's environment.
        """
        blackhole_filters = {
            'route.state': 'blackhole',
            'tag:Network': 'Private',
            'tag:Env': self.env,
        }
        return self.ec2vpc.get_all_route_tables(filters=blackhole_filters)

    def get_nat_reservations(self):
        """Return the EC2 query result for ``tag:Role`` = ``nat`` in this environment."""
        nat_filters = {
            'tag:Role': 'nat',
            'tag:Env': self.env,
        }
        return self.ec2.get_all_instances(filters=nat_filters)
# SNS topic that receives the monitor's notifications.
# NOTE: this is the placeholder value carried over from the original script;
# replace RESTOFARN with the real account/topic before deploying.
SNS_TOPIC_ARN = 'arn:aws:sns:us-east-1:RESTOFARN'


def _instance_id_for_ip(reservations, ip):
    """Return the ID of the first instance in *reservations* whose IP is *ip*.

    Only the first instance of each reservation is inspected, and the
    address is compared in its ``str()`` form because that is how the
    ping targets were built.  Returns None when nothing matches.
    """
    for reservation in reservations:
        instance = reservation.instances[0]
        if str(instance.private_ip_address) == ip:
            return instance.id
    return None


def main():
    """Ping every NAT instance and repoint routes away from dead ones.

    Returns:
        0 when there are no bad hosts and no blackholed routes.
        1 when there are bad hosts but no blackholed routes, when a route
          replacement fails, or when no healthy NAT instance is available
          to take over.
        None when the reroute loop completes (the caller treats None as 0),
          or when no case below matches.
    """
    environment = 'production'

    ec2 = boto.connect_ec2()
    sns = boto.connect_sns()
    ec2vpc = VPCConnection()

    monitoringInfo = NAT_Monitoring_Info(ec2vpc, ec2, environment)
    badRouteTables = monitoringInfo.get_broken_route_tables()
    natReservations = monitoringInfo.get_nat_reservations()

    # Ping all NAT instances in parallel, one thread per instance.
    print("Pinging...")
    pingList = []
    for natInstance in natReservations:
        current = pingy(str(natInstance.instances[0].private_ip_address))
        pingList.append(current)
        current.start()

    badHosts = []
    goodHosts = []
    for p in pingList:
        p.join()
        if p.getStatus() == 'Active':
            goodHosts.append(p.ip)
        else:
            badHosts.append(p.ip)

    # Case: There are no bad hosts or bad routes. Do nothing
    if not badRouteTables and not badHosts:
        message = "There are no bad hosts or routes. Exiting."
        sns.publish(SNS_TOPIC_ARN, message, 'PROD NAT MONITOR :: There are no bad hosts or routes')
        return 0

    # Case: There are bad hosts but no bad routes
    if not badRouteTables and badHosts:
        # Join with explicit separators so sentences and IPs do not run together.
        message = "\n".join([
            "There are bad hosts, but all route tables have been pointed to active NAT instances",
            "Please revive the following NAT instances: " + ", ".join(badHosts),
            "Nothing else for us to do...)",
        ])
        sns.publish(SNS_TOPIC_ARN, message, 'PROD NAT MONITOR :: Everything is working but there are bad NAT instances ')
        return 1

    # Case: There are bad route tables and hosts - FIXIT
    if badRouteTables and badHosts:
        print("There are bad routes and bad hosts. Rerouting traffic.")

        if not goodHosts:
            # random.choice() would raise IndexError on an empty list.
            print("No healthy NAT instances are available to take over traffic. Exiting")
            return 1

        for badHost in badHosts:
            goodHost = random.choice(goodHosts)
            goodHostInstanceId = _instance_id_for_ip(natReservations, goodHost)
            badHostInstanceId = _instance_id_for_ip(natReservations, badHost)

            badRid = ec2vpc.get_all_route_tables(filters={'route.instance-id': badHostInstanceId})
            if not badRid:
                print("No bad route tables associated with this " + badHostInstanceId + ":" + badHost + " - already diverted traffic")
                continue

            # NOTE(review): only the first matching route table is repointed,
            # as in the original; confirm whether all of them should be.
            routeTableId = badRid[0].id

            print("\nRedirecting traffic for " + badHost + " to: " + goodHost)
            print("Bad host info:")
            print(" IP: " + badHost)
            print(" Instance ID: " + badHostInstanceId)
            print(" Route ID: " + routeTableId)
            print("\n")
            print("Good host info:")
            print(" IP: " + goodHost)
            print(" Instance ID:" + goodHostInstanceId)

            result = ec2vpc.replace_route(routeTableId, '0.0.0.0/0', instance_id=goodHostInstanceId)
            if not result:
                print("Failed to redirect traffic. Exiting")
                return 1

            print("\nSuccessfully diverted traffic to " + goodHostInstanceId)
            # NOTE(review): nothing is published to SNS in this branch, so the
            # "Email notification" claim below is not backed by code here.
            print("Email notification has been sent. One the NAT instance is fixed run the ")
            print("following comand to fix it:")
            print("/opt/aws/bin/ec2-replace-route " + routeTableId + " -r 0.0.0.0/0 -i " + badHostInstanceId)

    # NOTE(review): blackholed routes with every host healthy fall through
    # to here without any action or notification, as in the original.
if __name__ == '__main__':
    # Script entry point: take an exclusive lock file, parse options, run
    # main(), and always release the lock before exiting.
    lock_file = "/var/lock/subsys/AWS-NAT-monitor"

    # O_CREAT | O_EXCL makes "check and create" a single atomic step, so two
    # runs started at the same moment cannot both acquire the lock.
    try:
        lock_fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
    except OSError:
        if not os.path.exists(lock_file):
            # Not a "lock already held" failure (e.g. permissions); surface it.
            raise
        print('Only one script can run at once. '
              'Script is locked with %s' % lock_file)
        sys.exit(-1)

    exit_code = 1
    try:
        try:
            os.write(lock_fd, b"1")
        finally:
            os.close(lock_fd)

        start_time = time.time()
        parser = optparse.OptionParser(
            formatter=optparse.TitledHelpFormatter(),
            usage=globals()['__doc__'],
            version='$Id: py.tpl 332 2008-10-21 22:24:52Z root $')
        parser.add_option('-v', '--verbose', action='store_true',
                          default=False, help='verbose output')
        (options, args) = parser.parse_args()
        #if len(args) < 1:
        #    parser.error ('missing argument')
        if options.verbose:
            print(time.asctime())
        exit_code = main()
        if exit_code is None:
            exit_code = 0
        if options.verbose:
            print(time.asctime())
            print('TOTAL TIME IN MINUTES: %s' % ((time.time() - start_time) / 60.0))
    except (KeyboardInterrupt, SystemExit):
        # Ctrl-C and sys.exit() propagate unchanged; the finally clause
        # below still removes the lock.
        raise
    except Exception as e:
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        # Do not call os._exit() here: it terminates immediately and skips
        # the finally clause, leaving a stale lock that blocks later runs.
        exit_code = 1
    finally:
        os.remove(lock_file)
    sys.exit(exit_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment