AWS Example - NAT Monitor. Pulled together code from a few different locations to make this.
#!/usr/bin/env python
# vim:set sr et ts=4 sw=4 ft=python fenc=utf-8: // See Vim, :help 'modeline'
import sys
import os
import traceback
import optparse
import time
import random
from contextlib import contextmanager
import boto
from boto.vpc import VPCConnection
from threading import Thread
class pingy(Thread):  # class inherits from thread
    """Thread that pings a single address and records whether it answered.

    ``status`` is an index into ``information``:
    0 = "yet to run", 1 = "no response", 2 = "Active".
    Call ``start()``, then ``join()``, then read ``getStatus()``.
    """

    def __init__(self, ip):
        """Remember the address to ping; nothing is executed until start()."""
        Thread.__init__(self)  # calls super init
        self.information = ("yet to run", "no response", "Active")
        self.ip = ip
        self.status = 0  # corresponds to the information tuple -- 0 = "yet to run"

    def _is_reply(self, line):
        """Return True when ``line`` mentions ``self.ip`` and carries a 'time=' field.

        ``str.find`` must not be used as a boolean here: it returns -1
        (truthy) when the text is absent and 0 (falsy) when it is found at
        the start of the line, so membership tests are used instead.
        """
        return self.ip in line and 'time=' in line

    def run(self):
        """Run ``ping -c 4 -W 1 <ip>`` and set ``status`` from its output."""
        pingexe = os.popen("ping -c 4 -W 1 " + self.ip, "r")
        self.status = 1  # running but no response yet
        try:
            while True:
                line = pingexe.readline()
                if not line:  # EOF: ping has finished
                    break
                if self._is_reply(line):
                    self.status = 2  # 2=Active
        finally:
            # Always release the pipe, even if reading fails.
            pingexe.close()

    def getStatus(self):
        """Return the human-readable text for the current ``status``."""
        return self.information[self.status]
# Class:
class NAT_Monitoring_Info:
    """Bundle a VPC connection, an EC2 connection and an environment name.

    Both query methods restrict their results to resources whose ``Env``
    tag equals the environment given at construction time.
    """

    def __init__(self, vpcconn, ec2conn, environment):
        """Store the two connections and the environment tag value."""
        self.ec2vpc, self.ec2, self.env = vpcconn, ec2conn, environment

    def get_broken_route_tables(self):
        """Return what the VPC connection's ``get_all_route_tables`` yields
        for tables tagged ``Network=Private`` that have a route in the
        ``blackhole`` state.
        """
        blackhole_filter = {
            'route.state': 'blackhole',
            'tag:Network': 'Private',
            'tag:Env': self.env,
        }
        return self.ec2vpc.get_all_route_tables(filters=blackhole_filter)

    def get_nat_reservations(self):
        """Return what the EC2 connection's ``get_all_instances`` yields for
        instances tagged ``Role=nat``.
        """
        nat_filter = {'tag:Role': 'nat', 'tag:Env': self.env}
        return self.ec2.get_all_instances(filters=nat_filter)
# Placeholder SNS topic ARN carried over from the original script; substitute
# the real topic before deploying.
SNS_TOPIC_ARN = 'arn:aws:sns:us-east-1:RESTOFARN'


def _ping_nat_hosts(reservations):
    """Ping the first instance of every reservation in parallel.

    Returns a ``(good_hosts, bad_hosts)`` pair of private-IP strings: hosts
    whose pinger reports 'Active' and hosts whose pinger reports anything else.
    """
    pingers = []
    for reservation in reservations:
        pinger = pingy(str(reservation.instances[0].private_ip_address))
        pingers.append(pinger)
        pinger.start()

    good_hosts = []
    bad_hosts = []
    for pinger in pingers:
        pinger.join()  # the status is only final once the thread has finished
        if pinger.getStatus() == 'Active':
            good_hosts.append(pinger.ip)
        else:
            bad_hosts.append(pinger.ip)
    return good_hosts, bad_hosts


def _instance_id_for_ip(reservations, ip):
    """Return the ID of the first reservation's lead instance whose private IP is ``ip``.

    The comparison uses ``str()`` on the address, mirroring how the host lists
    are built, so an instance without a private IP (listed as 'None') is still
    resolved instead of raising IndexError. Returns None when nothing matches.
    """
    for reservation in reservations:
        instance = reservation.instances[0]
        if str(instance.private_ip_address) == ip:
            return instance.id
    return None


def _divert_route(ec2vpc, route_table, cidr, bad_host, bad_id, good_host, good_id):
    """Re-point one route of ``route_table`` at ``good_id`` and report the outcome.

    Returns the truthiness of ``replace_route``'s result.
    """
    print("\nRedirecting traffic for " + bad_host + " to: " + good_host)
    print("Bad host info:")
    print(" IP: " + bad_host)
    print(" Instance ID: " + bad_id)
    print(" Route ID: " + route_table.id)
    print("\n")
    print("Good host info:")
    print(" IP: " + good_host)
    print(" Instance ID:" + good_id)
    result = ec2vpc.replace_route(route_table.id, cidr, instance_id=good_id)
    if not result:
        print("Failed to redirect traffic. Exiting")
        return False
    print("\nSuccessfully diverted traffic to " + good_id)
    # NOTE(review): no SNS message is published on this path even though the
    # text below says a notification was sent -- confirm the intended behavior.
    print("Email notification has been sent. One the NAT instance is fixed run the ")
    print("following comand to fix it:")
    print("/opt/aws/bin/ec2-replace-route " + route_table.id + " -r " + cidr + " -i " + bad_id)
    return True


def main():
    """Check the production NAT instances and fail routes over when needed.

    Returns:
        0    -- no bad hosts and no blackholed routes (an SNS message is sent).
        1    -- bad hosts exist but no routes are blackholed (an SNS message is
                sent), or a failover could not be carried out.
        None -- failover was attempted for every bad host, or there were
                blackholed routes but every NAT instance answered ping.
    """
    environment = 'production'
    ec2 = boto.connect_ec2()
    sns = boto.connect_sns()
    ec2vpc = VPCConnection()
    monitoringInfo = NAT_Monitoring_Info(ec2vpc, ec2, environment)

    badRouteTables = monitoringInfo.get_broken_route_tables()
    natReservations = monitoringInfo.get_nat_reservations()

    print("Pinging...")
    goodHosts, badHosts = _ping_nat_hosts(natReservations)

    # Case: There are no bad hosts or bad routes. Do nothing
    if not badRouteTables and not badHosts:
        message = "There are no bad hosts or routes. Exiting."
        sns.publish(SNS_TOPIC_ARN, message, 'PROD NAT MONITOR :: There are no bad hosts or routes')
        return 0

    # Case: There are bad hosts but no bad routes
    if not badRouteTables:
        message = ("There are bad hosts, but all route tables have been pointed to active NAT instances. "
                   "Please revive the following NAT instances: "
                   + ", ".join(badHosts)
                   + ". Nothing else for us to do...")
        sns.publish(SNS_TOPIC_ARN, message, 'PROD NAT MONITOR :: Everything is working but there are bad NAT instances ')
        return 1

    # Case: There are bad routes but every NAT instance answered ping.
    # There is no failed host to divert away from, so nothing is changed.
    if not badHosts:
        print("There are bad routes but no bad hosts. Nothing to reroute.")
        return None

    # Case: There are bad route tables and hosts - FIXIT
    print("There are bad routes and bad hosts. Rerouting traffic.")
    if not goodHosts:
        # random.choice() would raise IndexError on an empty list.
        print("Failed to redirect traffic: no NAT instance answered ping. Exiting")
        return 1

    for badHost in badHosts:
        goodHost = random.choice(goodHosts)
        goodHostInstanceId = _instance_id_for_ip(natReservations, goodHost)
        badHostInstanceId = _instance_id_for_ip(natReservations, badHost)
        badRid = ec2vpc.get_all_route_tables(filters={'route.instance-id': badHostInstanceId})
        if not badRid:
            print("No bad route tables associated with this " + badHostInstanceId + ":" + badHost + " - already diverted traffic")
            continue
        for routeTable in badRid:
            # replace_route needs the destination CIDR of the route being
            # replaced; take it from the routes that target the bad instance.
            cidrs = [route.destination_cidr_block for route in routeTable.routes
                     if route.instance_id == badHostInstanceId]
            for cidr in cidrs:
                if not _divert_route(ec2vpc, routeTable, cidr, badHost,
                                     badHostInstanceId, goodHost, goodHostInstanceId):
                    return 1
    return None
if __name__ == '__main__':
    # Single-instance guard: the lock file exists for as long as a run is in
    # progress and is removed again in the ``finally`` clause below.
    lock_file = "/var/lock/subsys/AWS-NAT-monitor"
    if os.path.exists(lock_file):
        print('Only one script can run at once. '
              'Script is locked with %s' % lock_file)
        sys.exit(1)
    with open(lock_file, 'w') as lock_handle:
        lock_handle.write("1")
    try:
        start_time = time.time()
        parser = optparse.OptionParser(
            version='$Id: py.tpl 332 2008-10-21 22:24:52Z root $')
        parser.add_option('-v', '--verbose', action='store_true',
                          default=False, help='verbose output')
        (options, args) = parser.parse_args()
        if options.verbose:
            print(time.asctime())
        exit_code = main()
        if exit_code is None:
            exit_code = 0
        if options.verbose:
            print(time.asctime())
            print('TOTAL TIME IN MINUTES: %s' % ((time.time() - start_time) / 60.0))
        # Propagate main()'s result as the process exit status.
        sys.exit(exit_code)
    except (KeyboardInterrupt, SystemExit):  # Ctrl-C / sys.exit()
        raise
    except Exception as e:
        print(str(e))
        traceback.print_exc()
        sys.exit(1)
    finally:
        # Without this a finished (or crashed) run would block every later run.
        os.remove(lock_file)
