Skip to content

Instantly share code, notes, and snippets.

@davidglvn
Created July 2, 2018 13:29
Show Gist options
  • Save davidglvn/89d0b1e79f570f9b6e6bb0085bda1faf to your computer and use it in GitHub Desktop.
Save davidglvn/89d0b1e79f570f9b6e6bb0085bda1faf to your computer and use it in GitHub Desktop.
Failover from spot to on-demand group
import boto3
from os import getenv
from json import loads
# Region that hosts both AutoScalingGroups (falls back to us-east-1).
REGION = getenv("REGION", "us-east-1")
# On-Demand AutoScalingGroup used as the failover target.
ON_DEMAND_ASG = getenv("ON_DEMAND_ASG")
# Spot AutoScalingGroup that is failed over from.
SPOT_ASG = getenv("SPOT_ASG")

# Both group names are mandatory: abort at import time when either one is
# unset or empty.
if not (ON_DEMAND_ASG and SPOT_ASG):
    raise Exception(
        "Spot and/or On-Demand AutoScalingGroups names are not defined"
    )

# Module-level AutoScaling API client reused by the helper functions.
client = boto3.client("autoscaling", region_name=REGION)
def main(event, context):
    """Handle AWS Lambda.

    Function that receives a CloudWatch alarm state change notification (via
    SNS). If the state changed to `ALARM` it does a failover from Spot to
    On-Demand (moves policies from the Spot ASG to the On-Demand ASG). If the
    state is `OK` it rolls the failover back: scale policies are moved back to
    the Spot ASG and the On-Demand ASG desired capacity is set to 0. Any other
    state is reported and left without action.

    Parameters
    ----------
    event : dict
        AWS Lambda event. Only the first record is read; its
        ``["Sns"]["Message"]`` must be a JSON document with a
        ``NewStateValue`` key.
    context : object
        AWS Lambda context (not used by this handler).

    Returns
    -------
    None

    """
    print("Executing Lambda Failover function")
    alarm = loads(event["Records"][0]["Sns"]["Message"])
    state = alarm["NewStateValue"]
    if state == "ALARM":
        print("'{}' Spots are down, doing a failover to '{}'".
              format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(SPOT_ASG, ON_DEMAND_ASG)
    elif state == "OK":
        print("'{}' Spots are back, rolling back the failover from '{}'".
              format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(ON_DEMAND_ASG, SPOT_ASG)
        # Spot is the main group again, drain the On-Demand group.
        set_asg_desire(ON_DEMAND_ASG, 0)
    else:
        # Neither a failover nor a rollback happened, so do not report one.
        print("Alarm state '{}' requires no action, nothing was changed".
              format(state))
        return
    print('Failover is done')
def failover_asg(source_asg, failover_asg):
    """Hand the workload over from one AutoScalingGroup to another.

    The target group is resized to the source group's current desired
    capacity and then receives the source group's scaling policies. Used for
    the Spot -> On-Demand failover and for the way back.

    Parameters
    ----------
    source_asg : string
        Name of the AutoScalingGroup to switch from.
    failover_asg : string
        Name of the AutoScalingGroup to switch to.

    Returns
    -------
    None

    """
    # Snapshot both groups and the scaling policies of the source group.
    groups = describe_asgs([source_asg, failover_asg])
    policies_to_move = describe_asg_policies(source_asg)

    # Bring the target group up to the size the source group is running at.
    target_size = groups[source_asg]["DesiredCapacity"]
    set_asg_desire(failover_asg, target_size)

    # Re-create the source policies on the target group.
    attach_asg_policies(failover_asg, policies_to_move)
def describe_asgs(asgs):
    """Get information about AutoScalingGroups.

    Can retrieve information about multiple AutoScalingGroups in one call.

    Parameters
    ----------
    asgs : list
        AutoScalingGroups names list.

    Returns
    -------
    dict
        Dictionary keyed by AutoScalingGroup name, where each value is the
        group description returned by the API. Names that are missing from
        the API response are absent from the dictionary; an empty `asgs`
        yields an empty dictionary.

    """
    # Nothing to look up: skip the API call instead of sending an empty name
    # list together with MaxRecords=0.
    if not asgs:
        return {}
    response = client.describe_auto_scaling_groups(
        AutoScalingGroupNames=asgs,
        MaxRecords=len(asgs)
    )
    return {
        asg["AutoScalingGroupName"]: asg
        for asg in response["AutoScalingGroups"]
    }
def set_asg_desire(asg, count):
    """Set the desired capacity of an AutoScalingGroup.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup.
    count : int
        New desired number of instances; it has to stay within the group's
        minimum and maximum size.

    Returns
    -------
    None

    """
    request = {
        "AutoScalingGroupName": asg,
        "DesiredCapacity": count,
        # Do not wait for a running cooldown period.
        "HonorCooldown": False,
    }
    client.set_desired_capacity(**request)
def describe_asg_policies(asg):
    """Get AutoScalingGroup scaling policies.

    All result pages are collected, so groups with more policies than fit
    into a single API response are returned completely.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup.

    Returns
    -------
    list
        AutoScalingGroup scaling policy objects (one dict per policy).

    """
    paginator = client.get_paginator("describe_policies")
    policies = []
    for page in paginator.paginate(AutoScalingGroupName=asg):
        policies.extend(page["ScalingPolicies"])
    return policies
def attach_asg_policies(asg, policies):
    """Attach new policies to an AutoScalingGroup.

    Every received policy is put on the AutoScalingGroup (an existing policy
    with the same name is updated). The CloudWatch alarms connected to the
    policy are updated as well: the old policy ARN is removed from the alarm
    actions and the new policy ARN is added.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup.
    policies : list
        AutoScalingGroup scaling policy objects, as returned by
        `describe_asg_policies`.

    Returns
    -------
    None

    """
    # Nothing to attach: avoid creating a CloudWatch client for no reason.
    if not policies:
        return

    # Fields returned by describe_alarms that put_metric_alarm does not
    # accept as input.
    read_only_fields = (
        "AlarmArn",
        "AlarmConfigurationUpdatedTimestamp",
        "StateValue",
        "StateReason",
        "StateReasonData",
        "StateUpdatedTimestamp",
        "StateTransitionedTimestamp",
        "EvaluationState",
    )

    cw_client = boto3.client('cloudwatch',
                             region_name=REGION)
    for policy in policies:
        # Create new policy object and remove old policy info
        new_policy = policy.copy()
        new_policy.pop("Alarms", None)
        new_policy.pop("PolicyARN", None)
        new_policy["AutoScalingGroupName"] = asg
        response = client.put_scaling_policy(
            **new_policy
        )
        old_arn = policy.get("PolicyARN")
        new_arn = response["PolicyARN"]

        # A policy without connected alarms has nothing to re-point.
        for alarm in policy.get("Alarms", []):
            alarm_response = cw_client.describe_alarms(
                AlarmNames=[alarm["AlarmName"]]
            )
            metric_alarms = alarm_response["MetricAlarms"]
            if not metric_alarms:
                # The alarm is not returned as a metric alarm (e.g. it was
                # deleted in the meantime); there is nothing to update.
                print("Alarm '{}' was not found, skipping".
                      format(alarm["AlarmName"]))
                continue
            metric_alarm = metric_alarms[0]

            # Remove Alarm dynamic info
            for field in read_only_fields:
                metric_alarm.pop(field, None)

            # Remove old policy and add new one
            actions = metric_alarm.setdefault("AlarmActions", [])
            if old_arn in actions:
                actions.remove(old_arn)
            if new_arn not in actions:
                actions.append(new_arn)

            cw_client.put_metric_alarm(
                **metric_alarm
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment