Skip to content

Instantly share code, notes, and snippets.

@chhibber
Created January 14, 2014 06:43
Show Gist options
  • Save chhibber/8414154 to your computer and use it in GitHub Desktop.
Save chhibber/8414154 to your computer and use it in GitHub Desktop.
AWS Example - NAT Monitor. Pulled together code from a few different locations to make this.
#!/usr/bin/env python
# vim:set sr et ts=4 sw=4 ft=python fenc=utf-8: // See Vim, :help 'modeline'
"""
"""
import sys
import os
import traceback
import optparse
import time
import random
from contextlib import contextmanager
import boto
from boto.vpc import VPCConnection
from threading import Thread
class pingy(Thread):
    """Thread that pings a single host and records whether it replied.

    Typical use: create it with an IP string, ``start()`` it, ``join()`` it,
    then read the outcome with ``getStatus()``.
    """

    def __init__(self, ip):
        """Prepare a ping of *ip* (a string); nothing runs until ``start()``."""
        Thread.__init__(self)
        # Human-readable names for the possible values of self.status.
        self.information = ("yet to run", "no response", "Active")
        self.ip = ip
        self.status = 0  # index into self.information; 0 = "yet to run"

    def _is_reply(self, line):
        """Return True if *line* of ping output is a reply from ``self.ip``.

        A reply line is recognised by containing both the target IP and a
        ``time=`` field. Substring tests are used instead of ``str.find()``
        because ``find()`` returns -1 (truthy) when the text is missing and
        0 (falsy) when it sits at the start of the line.
        """
        return self.ip in line and 'time=' in line

    def run(self):
        """Run ``ping -c 4 -W 1`` against the host and update ``self.status``.

        The status becomes 1 ("no response") as soon as the command has been
        launched and 2 ("Active") once any reply line is seen. All output is
        consumed so the ping process has finished before the thread ends.
        """
        # NOTE: self.ip is interpolated into a shell command line, so it must
        # come from a trusted source.
        pingexe = os.popen("ping -c 4 -W 1 " + self.ip, "r")
        self.status = 1  # running but no response yet
        try:
            for line in pingexe:
                if self._is_reply(line):
                    self.status = 2  # 2 = Active
        finally:
            # Always release the pipe, even if reading fails part-way.
            pingexe.close()

    def getStatus(self):
        """Return the human-readable name of the current status."""
        return self.information[self.status]
#####################################################################
# Class: NAT_Monitoring_Info
#####################################################################
class NAT_Monitoring_Info:
    """Groups the AWS lookups the NAT monitor performs for one environment."""

    def __init__(self, vpcconn, ec2conn, environment):
        """Store the two connections and the value matched against ``tag:Env``.

        vpcconn     -- object providing ``get_all_route_tables(filters=...)``
        ec2conn     -- object providing ``get_all_instances(filters=...)``
        environment -- value used for the ``tag:Env`` filter in both lookups
        """
        self.env = environment
        self.ec2 = ec2conn
        self.ec2vpc = vpcconn

    def get_broken_route_tables(self):
        """Return the route tables tagged Network=Private for this
        environment that contain a route in the ``blackhole`` state.

        The result is whatever the VPC connection returns for the query.
        """
        blackhole_filter = {
            'route.state': 'blackhole',
            'tag:Network': 'Private',
            'tag:Env': self.env,
        }
        return self.ec2vpc.get_all_route_tables(filters=blackhole_filter)

    def get_nat_reservations(self):
        """Return the result of querying instances tagged Role=nat for this
        environment, exactly as the EC2 connection returns it.
        """
        nat_filter = {'tag:Role': 'nat', 'tag:Env': self.env}
        return self.ec2.get_all_instances(filters=nat_filter)
def main():
    """Ping every NAT instance and move routes off the ones that are down.

    Steps:
      1. Look up blackholed private route tables and the NAT instances of
         the 'production' environment.
      2. Ping the first instance of every reservation in parallel.
      3. Depending on what is broken, either just notify via SNS or replace
         the 0.0.0.0/0 route of every route table that still points at a
         dead NAT with a randomly chosen live NAT.

    Returns:
        0    -- no bad hosts and no bad routes.
        1    -- something is wrong that was not (fully) repaired: bad hosts
                without bad routes, bad routes without bad hosts, no live
                NAT to fail over to, or a failed route replacement.
        None -- the rerouting pass completed (the caller treats None as 0).
    """
    environment = 'production'
    # Placeholder taken from the original script; put the real topic ARN here.
    topic_arn = 'arn:aws:sns:us-east-1:RESTOFARN'

    ec2 = boto.connect_ec2()
    sns = boto.connect_sns()
    ec2vpc = VPCConnection()

    monitoringInfo = NAT_Monitoring_Info(ec2vpc, ec2, environment)
    badRouteTables = monitoringInfo.get_broken_route_tables()
    natReservations = monitoringInfo.get_nat_reservations()

    # Start one ping thread per reservation and remember which instance id it
    # belongs to, so no IP -> id lookup is needed later (that lookup failed
    # for instances whose private_ip_address is None).
    print("Pinging...")
    probes = []
    for natInstance in natReservations:
        instance = natInstance.instances[0]
        current = pingy(str(instance.private_ip_address))
        current.start()
        probes.append((current, instance.id))

    badHosts = []   # (ip, instance id) of NATs that did not answer
    goodHosts = []  # (ip, instance id) of NATs that answered
    for p, instanceId in probes:
        p.join()
        if p.getStatus() == 'Active':
            goodHosts.append((p.ip, instanceId))
        else:
            badHosts.append((p.ip, instanceId))

    # Case: There are no bad hosts or bad routes. Do nothing
    if not badRouteTables and not badHosts:
        message = "There are no bad hosts or routes. Exiting."
        sns.publish(topic_arn, message, 'PROD NAT MONITOR :: There are no bad hosts or routes')
        return 0

    # Case: There are bad hosts but no bad routes
    if not badRouteTables:
        message = "There are bad hosts, but all route tables have been pointed to active NAT instances. "
        message += "Please revive the following NAT instances: "
        message += ", ".join(ip for ip, _ in badHosts)
        message += " (Nothing else for us to do...)"
        sns.publish(topic_arn, message, 'PROD NAT MONITOR :: Everything is working but there are bad NAT instances ')
        return 1

    # Case: There are bad routes but every NAT answered. Nothing here can be
    # repaired automatically, so report it instead of exiting silently.
    if not badHosts:
        message = "There are blackholed routes but all NAT instances responded to ping. Manual investigation is required."
        print(message)
        sns.publish(topic_arn, message, 'PROD NAT MONITOR :: Blackholed routes but no bad NAT instances')
        return 1

    # Case: There are bad routes and no live NAT to fail over to.
    if not goodHosts:
        message = "There are bad routes and no NAT instance responded to ping. Unable to reroute traffic."
        print(message)
        sns.publish(topic_arn, message, 'PROD NAT MONITOR :: No healthy NAT instance available')
        return 1

    # Case: There are bad route tables and hosts - FIXIT
    print("There are bad routes and bad hosts. Rerouting traffic.")
    for badHost, badHostInstanceId in badHosts:
        goodHost, goodHostInstanceId = random.choice(goodHosts)
        badRid = ec2vpc.get_all_route_tables(filters={'route.instance-id': badHostInstanceId})
        if not badRid:
            print("No bad route tables associated with this " + badHostInstanceId + ":" + badHost + " - already diverted traffic")
            continue

        # Repair every route table that still references the dead instance,
        # not only the first one returned.
        for routeTable in badRid:
            print("\nRedirecting traffic for " + badHost + " to: " + goodHost)
            print("Bad host info:")
            print(" IP: " + badHost)
            print(" Instance ID: " + badHostInstanceId)
            print(" Route ID: " + routeTable.id)
            print("\n")
            print("Good host info:")
            print(" IP: " + goodHost)
            print(" Instance ID:" + goodHostInstanceId)
            result = ec2vpc.replace_route(routeTable.id, '0.0.0.0/0', instance_id=goodHostInstanceId)
            if not result:
                print("Failed to redirect traffic. Exiting")
                return 1

            restoreCommand = "/opt/aws/bin/ec2-replace-route " + routeTable.id + " -r 0.0.0.0/0 -i " + badHostInstanceId
            # Actually send the notification that the console output promises.
            message = "Redirected 0.0.0.0/0 in route table " + routeTable.id
            message += " from bad NAT " + badHostInstanceId + " (" + badHost + ")"
            message += " to " + goodHostInstanceId + " (" + goodHost + "). "
            message += "To restore the original route run: " + restoreCommand
            sns.publish(topic_arn, message, 'PROD NAT MONITOR :: Traffic rerouted away from a bad NAT instance')

            print("\nSuccessfully diverted traffic to " + goodHostInstanceId)
            print("Email notification has been sent. One the NAT instance is fixed run the ")
            print("following comand to fix it:")
            print(restoreCommand)
if __name__ == '__main__':
    # Script entry point: take a lock file so only one monitor runs at a time,
    # parse the command line, run main() and exit with its result.
    lock_file = "/var/lock/subsys/AWS-NAT-monitor"

    # O_CREAT | O_EXCL creates the file only if it does not exist yet, in one
    # atomic step, so two concurrent runs cannot both pass the check.
    try:
        lock_fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    except OSError:
        if not os.path.exists(lock_file):
            # Not a held lock (e.g. missing directory or no permission).
            raise
        print('Only one script can run at once. '
              'Script is locked with %s' % lock_file)
        sys.exit(-1)
    try:
        os.write(lock_fd, b"1")
    finally:
        os.close(lock_fd)

    # Pessimistic default: stays 1 unless main() completes.
    exit_code = 1
    try:
        start_time = time.time()
        parser = optparse.OptionParser(
            formatter=optparse.TitledHelpFormatter(),
            usage=globals()['__doc__'],
            version='$Id: py.tpl 332 2008-10-21 22:24:52Z root $')
        parser.add_option('-v', '--verbose', action='store_true',
                          default=False, help='verbose output')
        (options, args) = parser.parse_args()
        if options.verbose:
            print(time.asctime())
        exit_code = main()
        if exit_code is None:
            exit_code = 0
        if options.verbose:
            print(time.asctime())
            print('TOTAL TIME IN MINUTES: %s' % ((time.time() - start_time) / 60.0))
    except Exception as e:
        # KeyboardInterrupt and SystemExit do not derive from Exception, so
        # Ctrl-C and sys.exit() still propagate (after the finally below).
        print('ERROR, UNEXPECTED EXCEPTION')
        print(str(e))
        traceback.print_exc()
        # Do not call os._exit() here: it would skip the finally clause and
        # leave the lock file behind, blocking every later run.
        exit_code = 1
    finally:
        os.remove(lock_file)
    sys.exit(exit_code)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment