Last active
April 3, 2022 18:16
-
-
Save rogerbush8/3ebcedfdc295c1a248cc to your computer and use it in GitHub Desktop.
nat-heartbeat-failover-monitor-script-for-aws
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# aws-instance-monitor | |
# | |
# This is a command-line tool, as well as a monitor/failover script that is designed | |
# for HA NAT, but should be usable when the following conditions are met: | |
# | |
# 1. Used in AWS VPC Route Table - VPC routing describes methods of egress | |
# for outbound traffic. Typically, a NAT will have a 0.0.0.0/0 rule | |
# and the route will be in a table associated with a Private Subnet. | |
# Thus, all Internet traffic (0.0.0.0/0) leaving from the Private Subnet | |
# is routed over the user instance (NAT). There are other user instances | |
# that go into the routing table (e.g. IPsec tunnel). These should work | |
# also, as this mechanism just switches the routing by substituting a | |
# different (healthy) user instance into each of the routes which reference | |
# the (unhealthy) user instance. | |
# 2. Pair of instances - they monitor each other with ping check. Split | |
# network is possible, and not handled by this simple script (nor can | |
# it be handled, completely, by two scripts on two boxes). A split | |
# network might see the NATs competing for the route (flipping back and | |
# forth at the failover rate). | |
# 3. The monitor script is designed to be run on both of the instances in | |
# the pair. | |
# | |
# Ideas for this python script came from Jinesh Varia's BASH script and | |
# article on HA NAT instances in VPC: http://aws.amazon.com/articles/2781451301784570 | |
# | |
# Commands: | |
# | |
# There are 3 commands: show-affected-routes, swap-instance, run-failover-monitor | |
# | |
# show-affected-routes essentially takes 3 search criteria: --env, --region, and --instance. | |
# The --env is given by an optional AWS Tag on the RouteTable. It may be ignored by | |
# passing in --env "" on the command-line. This is used, for example, to tag "prod" and | |
# "dev" environments, as an additional check to make sure we are not affecting production. | |
# show-affected-routes will select routes that match the --env, --region (AWS regions, | |
# e.g. 'us-east-1') and --instance (AWS instanceId of the NAT which will be swapped out | |
# for a new instanceId, e.g. 'i-301223ca'). | |
# | |
# swap-instance uses show-affected-routes to find the routes that reference the instance
# to swap. It then runs "aws ec2 replace-route" for each of those routes to change the
# instance. It does not do any checking on the destination CIDR block (e.g. 0.0.0.0/0);
# it merely keeps that CIDR block in place. This allows the script to be used, for
# example, for IPsec tunnel gateways.
# | |
# run-failover-monitor starts this script up in an infinite loop which performs a | |
# ping check on the peer instance. When the peer instance becomes unreachable, the | |
# script invokes the swap-instance action to swap this instance in as the healthy | |
# instance. | |
# | |
# Dependencies: | |
# | |
# This script is designed to have as few dependencies as possible. All of the code is in | |
# this single script, except for common Python libraries that are installed. In addition | |
# the script has the following dependencies: | |
# | |
# 1. AWS cli must be installed - the commands called are "aws ec2 describe-route-tables",
#    "aws ec2 replace-route", "aws ec2 create-route", "aws ec2 describe-instances",
#    "aws ec2 start-instances" and "aws ec2 stop-instances".
# 2. Each instance must have the ability to successfully run each of those commands.
# 3. ping is used for the ping test | |
# 4. curl is used to fetch our InstanceId when we run in monitor mode. | |
# | |
import sys | |
import os | |
import subprocess | |
import json | |
import copy | |
import argparse | |
import glob | |
import imp | |
import re | |
import time | |
from datetime import date | |
from datetime import datetime | |
class App:
    """Route-table failover actions and monitor loop for a pair of AWS gateway instances.

    Every AWS operation is performed by running the ``aws`` CLI in a subshell;
    ``ping`` is used for the health check and ``curl`` for instance metadata.
    Constructing an instance imports the AWS tools' shell environment into
    ``os.environ`` (see ``__init_aws_env``).
    """

    # Health-check and recovery timing used by run_failover_monitor.
    _NUM_PINGS = 3                    # pings sent per health check
    _WAIT_BETWEEN_PINGS = 2           # seconds between healthy checks
    _WAIT_FOR_INSTANCE_STOP = 60      # seconds to wait for the peer to stop
    _WAIT_FOR_INSTANCE_START = 300    # seconds to wait after restarting the peer

    # stderr patterns of the route commands, keyed by the symbol reported
    # in the result of replace_route / create_route.
    _ROUTE_ERROR_MAP = {
        'InvalidInstanceId': 'InvalidInstanceID.NotFound',
        'InvalidRoute': r"no route defined.*CreateRoute",
    }

    def __init__(self):
        self.__init_aws_env()

    def __init_aws_env(self):
        """Source aws-apitools-common.sh and copy the resulting environment into os.environ.

        The original author notes that without this the permissions for AWS
        calls on the role are not set up.  If the profile script cannot be
        sourced, ``env`` never runs and os.environ is left unchanged.
        """
        command = ['bash', '-c', '. /etc/profile.d/aws-apitools-common.sh && env']
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True)
        out, _ = proc.communicate()
        for line in out.split("\n"):
            key, sep, value = line.partition("=")
            # Skip blank lines and continuation lines of multi-line values;
            # they are not NAME=value pairs.
            if not sep or not key:
                continue
            os.environ[key] = value

    def __fixup_route_tags(self, data):
        """Replace each route table's Tags list of {Key, Value} items with a plain dict, in place."""
        for rt in data['RouteTables']:
            rt['Tags'] = dict((item['Key'], item['Value']) for item in (rt.get('Tags') or []))

    def __exec_cmd(self, cmd, error_map=None, throws=False):
        """Run ``cmd`` through the shell and capture its output.

        ``error_map`` maps a symbol to a regex; when the command fails, the
        first symbol whose regex is found in stderr is reported.

        Returns a dict ``{'out': stdout, 'err': stderr, 'rc': returncode,
        'symbol': matched symbol or None}``.

        Raises ``Exception(symbol, rc, err)`` when ``throws`` is true and the
        return code is non-zero.
        """
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                shell=True, universal_newlines=True)
        out, err = proc.communicate()
        # strip off one leading "\n" from err if it exists
        if err.startswith("\n"):
            err = err[1:]
        result = {'out': out, 'err': err, 'rc': proc.returncode, 'symbol': None}
        if proc.returncode != 0 and err:
            for symbol, pattern in (error_map or {}).items():
                if re.search(pattern, err):
                    result['symbol'] = symbol
                    break
        if proc.returncode == 0 or not throws:
            return result
        raise Exception(result['symbol'], result['rc'], result['err'])

    def __exec_json(self, cmd):
        """Run ``cmd`` and return its stdout parsed as JSON.

        Raises the ``__exec_cmd`` exception (carrying stderr) on a non-zero
        return code, instead of an opaque JSON parse error on empty output.
        """
        return json.loads(self.__exec_cmd(cmd, throws=True)['out'])

    def __fetch_routes_raw(self, region):
        """Return the describe-route-tables JSON for ``region`` with Tags flattened to dicts."""
        data = self.__exec_json(
            "aws ec2 describe-route-tables --region {0} --output json".format(region))
        self.__fixup_route_tags(data)
        return data

    def __describe_instance_unwrapped(self, region, instance_id):
        """Return the instance record for ``instance_id`` from describe-instances.

        Returns None when the command fails, the output is not the expected
        shape, or the returned record is for a different instance.
        """
        result = self.__exec_cmd(
            "aws ec2 describe-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))
        if result['rc'] != 0:
            self.log("describe-instances failed for {0}: rc={1} err={2}".format(
                instance_id, result['rc'], result['err']))
            return None
        try:
            obj = json.loads(result['out'])['Reservations'][0]['Instances'][0]
        except (ValueError, KeyError, IndexError, TypeError):
            return None
        if obj.get('InstanceId') != instance_id:
            return None
        return obj

    def __start_instance(self, region, instance_id):
        """Start ``instance_id`` and return the parsed start-instances response."""
        return self.__exec_json(
            "aws ec2 start-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))

    def __stop_instance(self, region, instance_id):
        """Stop ``instance_id`` and return the parsed stop-instances response."""
        return self.__exec_json(
            "aws ec2 stop-instances --region {0} --output json --instance-ids {1}".format(
                region, instance_id))

    def __validate_and_format_rtb_cidr(self, rtb_cidr):
        """Parse ``<rtb-id>,<dest-cidr>`` strings into route descriptors.

        ``rtb_cidr`` is a list of such strings, or None.  Returns a list of
        ``{'routeTableInstanceId': ..., 'cidr': ...}`` dicts (empty for None).

        Raises ValueError when an entry does not have the expected format.
        """
        if not rtb_cidr:
            return []
        parsed = []
        for spec in rtb_cidr:
            # Anchored at both ends so trailing garbage is rejected rather than dropped.
            m = re.match(r"^(rtb-[^,\s]+),(\d+[.]\d+[.]\d+[.]\d+[/]\d+)$", spec)
            if not m:
                raise ValueError("--rtb_cidr did not match regex (must be rtb-id,x.x.x.x/x)")
            parsed.append({'routeTableInstanceId': m.group(1), 'cidr': m.group(2)})
        return parsed

    def __run_route_command(self, cmd, dry_run):
        """Run a route-changing ``aws`` command and report the outcome.

        Returns ``{'ok': bool, 'symbol': ..., 'rc': ..., 'err': ...}``.  In
        dry-run mode the command is only logged and a success result is returned.
        """
        if dry_run:
            self.log("DRY RUN {0}".format(cmd))
            return {'ok': True, 'symbol': None, 'rc': 0, 'err': None}
        result = self.__exec_cmd(cmd, self._ROUTE_ERROR_MAP)
        if result['rc'] == 0:
            return {'ok': True, 'symbol': None, 'rc': 0, 'err': None}
        return {'ok': False, 'symbol': result['symbol'], 'rc': result['rc'], 'err': result['err']}

    def log(self, message):
        """Write ``message`` to stderr, prefixed with a ``yy-mm-dd HH:MM:SS`` timestamp."""
        stamp = datetime.today().strftime("%y-%m-%d %H:%M:%S")
        sys.stderr.write("{0} {1} \n".format(stamp, message))

    def get_url(self, url):
        """Fetch ``url`` with ``curl -s`` and return the ``__exec_cmd`` result dict."""
        return self.__exec_cmd("curl -s {0}".format(url))

    def find_routes_with_instance(self, env, region, instance):
        """Return the routes in ``region`` whose InstanceId equals ``instance``.

        Each returned row is a copy of the route record augmented with
        RouteTableId, VpcId, RouteTableName and RouteTableEnv from the owning
        route table (the last two are None when the tag is absent).  When
        ``env`` is non-empty, only tables whose env matches are considered.
        """
        data = self.__fetch_routes_raw(region)
        rows = []
        for rt in data['RouteTables']:
            tags = rt['Tags']
            # Looked up per table so a missing tag never reuses the value
            # of the previous table.
            table_name = tags.get('Name')
            table_env = tags.get('Env')
            # NOTE(review): a '__INSTANCE_PREFS' tag replaces the Env value used
            # for filtering. This is preserved from the original code but looks
            # like a copy-paste slip - confirm the intent.
            if '__INSTANCE_PREFS' in tags:
                table_env = tags['__INSTANCE_PREFS']
            if env and table_env != env:
                continue
            for route in rt.get('Routes', []):
                if 'InstanceId' not in route or route['InstanceId'] != instance:
                    continue
                row = copy.copy(route)
                row['RouteTableId'] = rt['RouteTableId']
                row['VpcId'] = rt['VpcId']
                row['RouteTableName'] = table_name
                row['RouteTableEnv'] = table_env
                rows.append(row)
        return rows

    def show_routes_with_instance(self, env, region, instance):
        """Command: print the rows of find_routes_with_instance to stdout as JSON."""
        rows = self.find_routes_with_instance(env, region, instance)
        json.dump(rows, sys.stdout, indent=4)

    def replace_route(self, region, to_instance, route_table_id, dest_cidr, dry_run=False):
        """Point the existing route ``dest_cidr`` in ``route_table_id`` at ``to_instance``.

        Returns ``{'ok', 'symbol', 'rc', 'err'}``; ``symbol`` is 'InvalidRoute'
        or 'InvalidInstanceId' when stderr matches the corresponding pattern.
        """
        cmd = ("aws ec2 replace-route --route-table-id {0} --destination-cidr-block {1} "
               "--instance-id {2} --region {3}").format(
                   route_table_id, dest_cidr, to_instance, region)
        return self.__run_route_command(cmd, dry_run)

    def create_route(self, region, to_instance, route_table_id, dest_cidr, dry_run=False):
        """Create the route ``dest_cidr`` in ``route_table_id`` targeting ``to_instance``.

        Used at startup when a route listed in --rtb_cidr does not exist yet.
        Returns the same result dict as replace_route.
        """
        cmd = ("aws ec2 create-route --route-table-id {0} --destination-cidr-block {1} "
               "--instance-id {2} --region {3}").format(
                   route_table_id, dest_cidr, to_instance, region)
        return self.__run_route_command(cmd, dry_run)

    def swap_routes_with_instance(self, env, region, instance, to_instance, dry_run=False):
        """Re-point every route that references ``instance`` at ``to_instance``.

        Failed replacements are logged.  Returns the list of replace_route
        results, one per attempted route.
        """
        results = []
        for route in self.find_routes_with_instance(env, region, instance):
            cidr = route.get('DestinationCidrBlock')
            if cidr is None:
                # replace_route addresses routes by --destination-cidr-block only.
                self.log("Skipping route without DestinationCidrBlock in {0}".format(
                    route['RouteTableId']))
                continue
            res = self.replace_route(region, to_instance, route['RouteTableId'], cidr, dry_run)
            if not res['ok']:
                self.log("Failed to replace route {0} {1}: rc={2} err={3}".format(
                    route['RouteTableId'], cidr, res['rc'], res['err']))
            results.append(res)
        return results

    def __resolve_own_instance_id(self, to_instance):
        """Return this server's instance id: ``to_instance`` if given, else from instance metadata.

        Exits with status 1 when the metadata lookup does not yield an instance id.
        """
        if to_instance:
            self.log("Using my instance id passed in from command-line arg --to_instance")
            return to_instance
        self.log("Fetching my instance id from AWS (curl)...")
        res = self.get_url("http://169.254.169.254/latest/meta-data/instance-id")
        fetched = res['out'].strip()
        # Reject empty output and anything that is not an instance id (e.g. an error page).
        if res['rc'] != 0 or not re.match(r"^i-[0-9a-f]+$", fetched):
            self.log("Unable to automatically determine my AWS instanceId (script must be running on an AWS instance)")
            sys.exit(1)
        return fetched

    def __claim_owned_routes(self, region, to_instance, rtb_cidr, dry_run):
        """Startup recovery: point every route listed in ``rtb_cidr`` at ``to_instance``.

        A route that does not exist yet is created.  Problems are logged and
        startup continues.  Raises ValueError for a malformed ``rtb_cidr`` entry.
        """
        owned = self.__validate_and_format_rtb_cidr(rtb_cidr)
        if not owned:
            self.log("No --rtb_cidr(s) specified, so this instance acts as a 'hot idle spare'")
            return
        self.log("Starting up and setting --rtb_cidr(s) to have us as the instance (i.e. for recovery).")
        dry_run_pre = "DRY RUN " if dry_run else ""
        for entry in owned:
            table_id = entry['routeTableInstanceId']
            cidr = entry['cidr']
            self.log("{0}Startup Recovery: setting route {1} {2} {3} with my instance_id = {4}".format(
                dry_run_pre, region, table_id, cidr, to_instance))
            if dry_run:
                continue
            out = self.replace_route(region, to_instance, table_id, cidr, dry_run)
            if out['ok']:
                continue
            if out['symbol'] == 'InvalidRoute':
                self.log("Route doesn't exist, creating...")
                out = self.create_route(region, to_instance, table_id, cidr, dry_run)
                if not out['ok']:
                    self.log("Problem with creating route. Continuing... err={0}".format(out['err']))
            else:
                self.log("Problem with replacing route. Possible bad startup params, please check and fix. Continuing...")

    def __resolve_peer_ip(self, region, instance, ip):
        """Return the peer IP to ping: ``ip`` if given, else the peer's PrivateIpAddress.

        Exits with status 1 when no IP can be determined.
        """
        if ip:
            self.log("Using peer IP address passed in with --ip")
        else:
            self.log("Fetching Private IP for peer from AWS...")
            data = self.__describe_instance_unwrapped(region, instance)
            if not data:
                self.log("ERROR: Couldn't find Private IP for peer instance {0}".format(instance))
            elif 'PrivateIpAddress' not in data:
                self.log("ERROR: Peer instance {0} has no PrivateIpAddress!".format(instance))
            else:
                ip = data['PrivateIpAddress']
        if not ip:
            self.log("ERROR: Couldn't determine Peer IP for pingtest. Aborting...")
            sys.exit(1)
        return ip

    def __recover_from_peer_failure(self, env, region, instance, to_instance, ip, dry_run):
        """Take over the peer's routes, then stop and restart the peer.

        Loops until the peer is seen in state 'stopped' and a start has been
        issued.  Raises Exception when an ``aws`` call fails or the peer's
        state cannot be read.

        NOTE(review): in dry-run mode the peer is never actually stopped, so
        this loop keeps polling until the instance reaches 'stopped' by other means.
        """
        unhealthy_routes = self.find_routes_with_instance(env, region, instance)
        stopping_nat = False
        while True:
            self.log("RECOVERY INITIATED")
            # Swap routes
            if unhealthy_routes:
                msg = "Found {0} unhealthy routes with instance {1}. Swapping " + \
                      "InstanceId to instance {2}..."
                self.log(msg.format(len(unhealthy_routes), instance, to_instance))
                self.swap_routes_with_instance(env, region, instance, to_instance, dry_run)
                self.log("Unhealthy routes swapped to instance {0}".format(to_instance))
                # For now, let's say it works
                unhealthy_routes = []
            else:
                self.log("No unhealthy routes found with instance {0} in {1} {2}".format(instance, env, region))

            self.log("Checking state for {0} ...".format(ip))
            data = self.__describe_instance_unwrapped(region, instance)
            if not data:
                raise Exception("Unable to read state of peer instance {0}".format(instance))
            state = data['State']['Name']
            self.log("State for {0}, State = {1}".format(ip, state))

            if state == 'stopped':
                self.log("Instance {0} at {1} stopped, restarting...".format(instance, ip))
                if not dry_run:
                    self.__start_instance(region, instance)
                self.log("Waiting {0} seconds for restart of Instance {1}...".format(
                    self._WAIT_FOR_INSTANCE_START, instance))
                time.sleep(self._WAIT_FOR_INSTANCE_START)
                return

            if not stopping_nat:
                self.log("Instance {0} at {1} is not stopped, so stopping...".format(instance, ip))
                if not dry_run:
                    self.__stop_instance(region, instance)
                stopping_nat = True
            self.log("Waiting {0} seconds for Instance {1} to stop...".format(
                self._WAIT_FOR_INSTANCE_STOP, instance))
            time.sleep(self._WAIT_FOR_INSTANCE_STOP)

    def run_failover_monitor(self, env, region, instance, rtb_cidr=None, ip=None,
                             to_instance=None, dry_run=False, verbose=False):
        """Command: run the failover monitor; loops forever.

        On startup the routes listed in ``rtb_cidr`` are pointed at this
        server.  The peer ``instance`` is then pinged repeatedly; when no ping
        of a check succeeds, the routes referencing the peer are swapped to
        this server and the peer is stopped and restarted.

        ``to_instance`` (this server's instance id) and ``ip`` (the peer's IP)
        are looked up from AWS when not given.  With ``dry_run`` no AWS write
        command is executed.  With ``verbose`` successful ping tests are logged.
        """
        self.log("AWS Peered Instance Failover Monitor Starting...")
        self.log("Peered instance to monitor is AWS instance = {0}".format(instance))

        to_instance = self.__resolve_own_instance_id(to_instance)
        self.log("My AWS InstanceId = {0}".format(to_instance))

        self.__claim_owned_routes(region, to_instance, rtb_cidr, dry_run)

        ip = self.__resolve_peer_ip(region, instance, ip)
        self.log("Peer IP for ping test = {0}".format(ip))
        self.log("AWS Peered Instance Failover Monitor Initialized and Running...")

        # N.B. -W ping_timeout omitted as this seems to fail on the Mac
        ping_cmd = "ping -c {0} {1} | grep time= | wc -l".format(self._NUM_PINGS, ip)
        healthy_ping_count = 0
        while True:
            # Number of echo replies received (lines containing "time=").
            ping_count = int(self.__exec_cmd(ping_cmd)['out'])

            # Healthy
            if ping_count > 0:
                healthy_ping_count += 1
                if verbose:
                    self.log("Pingtest SUCCESS {0} for Peer at {1}".format(healthy_ping_count, ip))
                time.sleep(self._WAIT_BETWEEN_PINGS)
                continue

            # Unhealthy
            self.log("FAILURE DETECTED - Pingtest failed for Peer at {0}".format(ip))
            healthy_ping_count = 0
            try:
                self.__recover_from_peer_failure(env, region, instance, to_instance, ip, dry_run)
            except Exception as e:
                # Keep the monitor alive: a failed AWS call is retried on the
                # next failed ping test instead of terminating the process.
                self.log("RECOVERY ERROR: {0}. Will retry on next failed ping test.".format(e))
                time.sleep(self._WAIT_BETWEEN_PINGS)
def main():
    """Parse the command line and run the selected App command.

    Three sub-commands are defined: swap-instance, show-affected-routes and
    run-failover-monitor.  The parsed options are passed to the matching App
    method as keyword arguments.  App is constructed only after the arguments
    have parsed, so --help and usage errors do not trigger its environment setup.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(
        dest='command',
        help="aws-instance-monitor is a command-line tool and an "
             "automated failover monitor process. It supports several commands useful for NAT (or other) "
             "failover, as well as a command that sets it into failover monitor mode.\nCommands:")
    # Python 3 treats sub-commands as optional by default; require one so a
    # bare invocation produces a usage error instead of a missing 'func'.
    subparsers.required = True

    # Subparser: swap-instance
    swap_instance = subparsers.add_parser(
        'swap-instance',
        help="Swap the existing (NAT) instance (--instance) (AWS instance-id, e.g. i-301223ca) to a different "
             "(healthy NAT) instance (--to_instance) in every route that we find the instance listed. Performs an "
             "ec2-replace-route on potentially many routes in a single --region.\nExample: "
             "aws-instance-monitor swap-instance --env prod --region us-east-1 --instance i-301223ca --to_instance i-57684fad")
    swap_instance.set_defaults(func=App.swap_routes_with_instance)

    # Subparser: swap-instance, options
    swap_instance.add_argument('--dry_run', action='store_true',
        help="Run command in no write mode as a test")
    swap_instance.add_argument('--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev)")
    swap_instance.add_argument('--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    swap_instance.add_argument("--instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to SWAP")
    swap_instance.add_argument("--to_instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to change to")

    # Subparser: show-affected-routes
    show_affected = subparsers.add_parser(
        'show-affected-routes',
        help="Shows metadata for routes that have the --instance. These will be routes that "
             "would be affected by the 'swap-instance' --instance command.\nExample: "
             "aws-instance-monitor show-affected-routes --env prod --region us-east-1 --instance i-301223ca")
    show_affected.set_defaults(func=App.show_routes_with_instance)

    # Subparser: show-affected-routes, options
    show_affected.add_argument('--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev). Use '' empty string to specify 'any' env.")
    show_affected.add_argument('--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    show_affected.add_argument("--instance", type=str, required=True,
        help="AWS InstanceId (of NAT) to change")

    # Subparser: run-failover-monitor
    run_failover = subparsers.add_parser(
        'run-failover-monitor',
        help="Starts this script up as a failover monitor, that uses a pingcheck to determine the "
             "availability of a peered machine, which also should be running the same program, with "
             "this machine as the peer. When this script determines the peer is unhealthy, it invokes "
             "the 'swap-instance' method, which changes all the entries in the routing tables which "
             "reference the unhealthy server, to this server. This implements a hot/hot NAT failover system. "
             "The switching technique and this script is general enough to be used by any user instance, "
             "acting as a gateway.\nExample: aws-instance-monitor run-failover-monitor --instance i-57684fad "
             "--region us-east-1 --env prod")
    run_failover.set_defaults(func=App.run_failover_monitor)

    # Subparser: run-failover-monitor, options
    run_failover.add_argument('--dry_run', action='store_true',
        help="Run command in no write mode as a test")
    run_failover.add_argument('--verbose', action='store_true',
        help="More output (e.g. ping test results)")
    run_failover.add_argument('--env', type=str, required=True,
        help="env (Tag 'Env' on Route Table) to be affected (e.g. prod, dev). Use '' empty string to specify 'any' env.")
    run_failover.add_argument('--region', type=str, required=True,
        help="AWS region to be affected (e.g. us-east-1, us-west-2)")
    run_failover.add_argument("--instance", type=str, required=True,
        help="AWS InstanceId of peer (NAT) to monitor")
    run_failover.add_argument("--rtb_cidr", type=str, nargs='+',
        help="1 to N route table and CIDR (dest), of format <rtb>,<cidr> (e.g. rtb-be2005db,0.0.0.0/0), each represents "
             "a route that the monitor 'owns' (will set instance to itself on startup)")
    run_failover.add_argument("--to_instance", type=str,
        help="AWS InstanceId of this server (NAT) to change to (will be fetched from AWS if not specified).")
    run_failover.add_argument("--ip", type=str,
        help="IP address of the peer (NAT) to ping (the peer's private IP will be fetched from AWS if not specified).")

    args = parser.parse_args()

    # Convert argparse namespace to dict; drop the dispatch-only entries so
    # the remaining keys match the command method's keyword parameters.
    arg_dict = vars(args)
    func = arg_dict.pop('func')
    arg_dict.pop('command', None)

    # Call by unpacking dict to function call
    app = App()
    func(app, **arg_dict)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment