-
-
Save filipeandre/755067239685da87e64ed55dbe303d9a to your computer and use it in GitHub Desktop.
Failover from spot to on-demand group
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import boto3 | |
| from os import getenv | |
| from json import loads | |
# Region that hosts both AutoScalingGroups (falls back to us-east-1)
REGION = getenv("REGION", "us-east-1")

# Name of the On-Demand AutoScalingGroup used as the failover target
ON_DEMAND_ASG = getenv("ON_DEMAND_ASG")

# Name of the Spot AutoScalingGroup
SPOT_ASG = getenv("SPOT_ASG")

# Both group names are mandatory; refuse to load the module without them
if not (ON_DEMAND_ASG and SPOT_ASG):
    raise Exception(
        "Spot and/or On-Demand AutoScalingGroups names are not defined"
    )

# Shared Auto Scaling client used by every helper in this module
client = boto3.client('autoscaling', region_name=REGION)
def main(event, context):
    """AWS Lambda entry point.

    Receives a CloudWatch alarm state-change notification delivered through
    SNS.  When the new state is ``ALARM`` the Spot group is failed over to
    the On-Demand group (capacity and scaling policies are applied to the
    On-Demand ASG).  When the new state is ``OK`` the failover is rolled
    back to the Spot group and the On-Demand group is scaled down to zero.

    Parameters
    ----------
    event : dict
        AWS Lambda event; the alarm JSON is read from
        ``event["Records"][0]["Sns"]["Message"]``.
    context : dict
        AWS Lambda context (unused).

    Returns
    -------
    None
    """
    print("Executing Lambda Failover function")

    sns_message = event["Records"][0]["Sns"]["Message"]
    alarm = loads(sns_message)
    new_state = alarm["NewStateValue"]

    if new_state == "ALARM":
        down_notice = "'{}' Spots are down, doing a failover to '{}'"
        print(down_notice.format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(SPOT_ASG, ON_DEMAND_ASG)
    elif new_state == "OK":
        back_notice = (
            "'{}' Spots are back, rolling back the failover from '{}'"
        )
        print(back_notice.format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(ON_DEMAND_ASG, SPOT_ASG)
        # On-Demand capacity is no longer needed once Spot is primary again
        set_asg_desire(ON_DEMAND_ASG, 0)

    # NOTE(review): the listing this was restored from lost its indentation;
    # this message is assumed to be printed for every state - confirm.
    print('Failover is done')
def failover_asg(source_asg, failover_asg):
    """Switch between two AutoScalingGroups.

    Useful for failing over from Spot to On-Demand and back: the target
    group is resized to the source group's desired capacity and then
    receives the source group's scaling policies.

    Parameters
    ----------
    source_asg : string
        Name of the AutoScalingGroup to switch from.
    failover_asg : string
        Name of the AutoScalingGroup to switch to.

    Returns
    -------
    None
    """
    # Look up both groups and the policies currently on the source group
    group_info = describe_asgs([source_asg, failover_asg])
    policies_to_copy = describe_asg_policies(source_asg)

    # Resize the target group to match the source group's desired capacity
    wanted_capacity = group_info[source_asg]["DesiredCapacity"]
    set_asg_desire(failover_asg, wanted_capacity)

    # Put the source group's policies on the target group
    attach_asg_policies(failover_asg, policies_to_copy)
def describe_asgs(asgs):
    """Fetch the descriptions of one or more AutoScalingGroups.

    Parameters
    ----------
    asgs : list
        Names of the AutoScalingGroups to describe.

    Returns
    -------
    dict
        Mapping of AutoScalingGroup name to the description returned for
        that group.
    """
    reply = client.describe_auto_scaling_groups(
        AutoScalingGroupNames=asgs,
        MaxRecords=len(asgs),
    )
    # Index the returned descriptions by group name for direct lookup
    return {
        group["AutoScalingGroupName"]: group
        for group in reply["AutoScalingGroups"]
    }
def set_asg_desire(asg, count):
    """Set the desired number of instances of an AutoScalingGroup.

    Parameters
    ----------
    asg : string
        Name of the AutoScalingGroup.
    count : int
        New desired number of instances; per the original documentation it
        must lie between the group's minimum and maximum size.

    Returns
    -------
    None
    """
    request = {
        "AutoScalingGroupName": asg,
        "DesiredCapacity": count,
        # Do not wait for a cooldown period before applying the new size
        "HonorCooldown": False,
    }
    client.set_desired_capacity(**request)
def describe_asg_policies(asg):
    """Get all scaling policies of an AutoScalingGroup.

    ``describe_policies`` returns its results in pages; every page is
    followed through ``NextToken`` so that no policy is silently dropped
    when the group has more policies than fit in a single response.

    Parameters
    ----------
    asg : string
        Name of the AutoScalingGroup.

    Returns
    -------
    list
        Scaling policy descriptions of the AutoScalingGroup.
    """
    policies = []
    request = {"AutoScalingGroupName": asg}
    while True:
        response = client.describe_policies(**request)
        policies.extend(response["ScalingPolicies"])
        next_token = response.get("NextToken")
        if not next_token:
            # No further pages
            return policies
        request["NextToken"] = next_token
def attach_asg_policies(asg, policies):
    """Attach scaling policies to an AutoScalingGroup and repoint alarms.

    Every received policy is written to ``asg`` with ``put_scaling_policy``
    (a policy that already exists on the group is updated).  Each CloudWatch
    alarm listed on the original policy is then rewritten so that its alarm
    actions reference the newly written policy instead of the original one.

    Alarms that can no longer be described as metric alarms are skipped
    with a log line instead of aborting the whole failover.

    Parameters
    ----------
    asg : string
        Name of the AutoScalingGroup that receives the policies.
    policies : list
        Scaling policy descriptions as returned by ``describe_policies``.

    Returns
    -------
    None
    """
    cw_client = boto3.client('cloudwatch', region_name=REGION)

    # Fields present in a describe_alarms result that put_metric_alarm does
    # not accept as input and that therefore must be stripped first.
    read_only_alarm_fields = (
        "AlarmArn",
        "AlarmConfigurationUpdatedTimestamp",
        "StateValue",
        "StateReason",
        "StateReasonData",
        "StateUpdatedTimestamp",
        "StateTransitionedTimestamp",
        "EvaluationState",
    )

    for policy in policies:
        # Build the put_scaling_policy request from the described policy,
        # dropping the response-only keys and retargeting it at `asg`.
        new_policy = policy.copy()
        new_policy.pop("Alarms", None)
        new_policy.pop("PolicyARN", None)
        new_policy["AutoScalingGroupName"] = asg
        response = client.put_scaling_policy(**new_policy)

        old_arn = policy.get("PolicyARN")
        new_arn = response["PolicyARN"]

        for alarm in policy.get("Alarms") or []:
            alarm_response = cw_client.describe_alarms(
                AlarmNames=[alarm["AlarmName"]]
            )
            metric_alarms = alarm_response.get("MetricAlarms") or []
            if not metric_alarms:
                # Nothing to rewrite (alarm not returned as a metric alarm)
                print("Alarm '{}' was not found, skipping".
                      format(alarm["AlarmName"]))
                continue

            metric_alarm = metric_alarms[0]
            for field in read_only_alarm_fields:
                metric_alarm.pop(field, None)

            # Replace the old policy with the new one; tolerate an alarm
            # that does not (or no longer) reference the old policy.
            actions = [
                action
                for action in metric_alarm.get("AlarmActions") or []
                if action != old_arn
            ]
            if new_arn not in actions:
                actions.append(new_arn)
            metric_alarm["AlarmActions"] = actions

            cw_client.put_metric_alarm(**metric_alarm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment