davidglvn · July 2, 2018 13:29
diff --git a/lambda_asg_failover.py b/lambda_asg_failover.py
 import boto3
 from os import getenv
 from json import loads

 # Region in which both AutoScalingGroups live (defaults to us-east-1)
 REGION = getenv("REGION", "us-east-1")
 # Name of the On-Demand AutoScalingGroup used as the failover target
 ON_DEMAND_ASG = getenv("ON_DEMAND_ASG")
 # Name of the Spot AutoScalingGroup that normally serves the load
 SPOT_ASG = getenv("SPOT_ASG")

 # Both names are mandatory: refuse to load the module without them.
 if not (ON_DEMAND_ASG and SPOT_ASG):
    raise Exception(
        "Spot and/or On-Demand AutoScalingGroups names are not "
        "defined"
    )

 # Shared AutoScaling API client used by every helper in this module
 client = boto3.client('autoscaling', region_name=REGION)


 def main(event, context):
    """Handle an AWS Lambda invocation.

    Receives a CloudWatch alarm state change notification (delivered via
    SNS). If the new state is ``ALARM`` it fails over from Spot to On-Demand
    (``failover_asg(SPOT_ASG, ON_DEMAND_ASG)``). If the new state is ``OK``
    it rolls the failover back to Spot and then sets the On-Demand desired
    capacity to 0. Any other state is logged and ignored.

    Parameters
    ----------
    event : dict
        AWS Lambda event. Only the first record is read; its
        ``["Sns"]["Message"]`` must be a JSON document that contains
        ``NewStateValue``.
    context : object
        AWS Lambda context (unused).

    Returns
    -------
    None

    """
    print("Executing Lambda Failover function")
    alarm = loads(event["Records"][0]["Sns"]["Message"])
    state = alarm["NewStateValue"]

    if state == "ALARM":
        print("'{}' Spots are down, doing a failover to '{}'".
              format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(SPOT_ASG, ON_DEMAND_ASG)
    elif state == "OK":
        print("'{}' Spots are back, rolling back the failover from '{}'".
              format(SPOT_ASG, ON_DEMAND_ASG))
        failover_asg(ON_DEMAND_ASG, SPOT_ASG)
        set_asg_desire(ON_DEMAND_ASG, 0)
    else:
        # States such as INSUFFICIENT_DATA trigger no action, so do not
        # report a completed failover for them.
        print("Ignoring alarm state '{}', no failover performed".
              format(state))
        return
    print('Failover is done')


 def failover_asg(source_asg, failover_asg):
    """Switch between two AutoScalingGroups.

    Useful for failing over from Spot to On-Demand and back: the target
    group is resized to the source group's desired capacity and receives
    the source group's scaling policies.

    Parameters
    ----------
    source_asg : string
        Name of AutoScalingGroup to switch from.
    failover_asg : string
        Name of AutoScalingGroup to switch to.

    Returns
    -------
    None

    """
    # Look up both groups, keyed by name
    group_info = describe_asgs([source_asg, failover_asg])
    # Policies currently attached to the group we are leaving
    policies_to_move = describe_asg_policies(source_asg)
    # The target group takes over the source group's size ...
    wanted_size = group_info[source_asg]["DesiredCapacity"]
    set_asg_desire(failover_asg, wanted_size)
    # ... and its scaling policies
    attach_asg_policies(failover_asg, policies_to_move)


 def describe_asgs(asgs):
    """Get information about AutoScalingGroups.

    Can retrieve information about multiple AutoScalingGroups in a single
    ``describe_auto_scaling_groups`` call.

    Parameters
    ----------
    asgs : list
        AutoScalingGroups names list. An empty list yields an empty result
        without calling the API.

    Returns
    -------
    dict
        Dictionary with AutoScalingGroups information, where each key is an
        AutoScalingGroup name and each value is the group description
        returned by the API. Names that the API does not return are absent.

    """
    if not asgs:
        # An empty request would send MaxRecords=0 and an empty name filter;
        # there is nothing to look up, so answer locally.
        return {}

    response = client.describe_auto_scaling_groups(
        AutoScalingGroupNames=asgs,
        MaxRecords=len(asgs)
    )

    return {
        group["AutoScalingGroupName"]: group
        for group in response["AutoScalingGroups"]
    }


 def set_asg_desire(asg, count):
    """Set the desired number of instances of an AutoScalingGroup.

    The request is sent with ``HonorCooldown=False``.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup.
    count : int
        New desired number of instances; expected to be no lower than the
        group's minimum size and no higher than its maximum size.

    Returns
    -------
    None

    """
    request = {
        "AutoScalingGroupName": asg,
        "DesiredCapacity": count,
        "HonorCooldown": False,
    }
    client.set_desired_capacity(**request)


 def describe_asg_policies(asg):
    """Fetch the scaling policies of an AutoScalingGroup.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup.

    Returns
    -------
    list
        The ``ScalingPolicies`` entry of the ``describe_policies`` response
        for the group.

    """
    policies_response = client.describe_policies(AutoScalingGroupName=asg)
    return policies_response["ScalingPolicies"]


 def attach_asg_policies(asg, policies):
    """Attach scaling policies to an AutoScalingGroup and retarget alarms.

    Every received policy is put on ``asg`` (per the original contract, a
    policy that already exists there is updated). Each CloudWatch alarm
    listed on the source policy is then re-submitted so that it references
    the new policy ARN instead of the old one.

    Parameters
    ----------
    asg : string
        Name of AutoScalingGroup that receives the policies.
    policies : list
        Scaling policy dicts, as returned by ``describe_asg_policies``.

    Returns
    -------
    None

    """
    if not policies:
        # Nothing to attach, so no CloudWatch client is needed either.
        return

    cw_client = boto3.client('cloudwatch',
                             region_name=REGION)
    for policy in policies:
        # Build the new policy from the old one, minus the old policy info
        new_policy = {key: value for key, value in policy.items()
                      if key not in ("Alarms", "PolicyARN")}
        new_policy["AutoScalingGroupName"] = asg

        response = client.put_scaling_policy(**new_policy)

        for alarm in policy.get("Alarms", []):
            described = cw_client.describe_alarms(
                AlarmNames=[alarm["AlarmName"]]
            )
            metric_alarms = described.get("MetricAlarms", [])
            if not metric_alarms:
                # Indexing [0] on an empty result would raise IndexError
                # and abort the remaining policies; skip this alarm instead.
                print("Alarm '{}' was not found, skipping it".
                      format(alarm["AlarmName"]))
                continue

            cw_client.put_metric_alarm(
                **_retarget_alarm(metric_alarms[0], policy["PolicyARN"],
                                  response["PolicyARN"])
            )


 def _retarget_alarm(metric_alarm, old_arn, new_arn):
    """Build put_metric_alarm arguments that point an alarm at a new policy.

    Parameters
    ----------
    metric_alarm : dict
        One entry of ``MetricAlarms`` from a ``describe_alarms`` response.
        It is not modified.
    old_arn : string
        Policy ARN to remove from the alarm's action lists.
    new_arn : string
        Policy ARN to put in its place.

    Returns
    -------
    dict
        Copy of ``metric_alarm`` without the state/identity fields and with
        ``old_arn`` replaced by ``new_arn`` in every action list that held
        it. If no list held it, ``new_arn`` is added to ``AlarmActions``.

    """
    # Fields reported by describe_alarms that describe the alarm's identity
    # or current state rather than its configuration.
    read_only = (
        "AlarmArn",
        "AlarmConfigurationUpdatedTimestamp",
        "StateValue",
        "StateReason",
        "StateReasonData",
        "StateUpdatedTimestamp",
        "StateTransitionedTimestamp",
        "EvaluationState",
    )
    params = {key: value for key, value in metric_alarm.items()
              if key not in read_only}

    swapped = False
    for action_key in ("AlarmActions", "OKActions",
                       "InsufficientDataActions"):
        actions = params.get(action_key) or []
        if old_arn not in actions:
            continue
        kept = [arn for arn in actions if arn != old_arn]
        if new_arn not in kept:
            kept.append(new_arn)
        params[action_key] = kept
        swapped = True

    if not swapped:
        # The old policy was not referenced anywhere; still make sure the
        # alarm triggers the new policy, as the caller intends.
        alarm_actions = list(params.get("AlarmActions") or [])
        if new_arn not in alarm_actions:
            alarm_actions.append(new_arn)
        params["AlarmActions"] = alarm_actions

    return params
	import boto3
	from os import getenv
	from json import loads

	# Region in which both AutoScalingGroups live (defaults to us-east-1)
	REGION = getenv("REGION", "us-east-1")
	# Name of the On-Demand AutoScalingGroup used as the failover target
	ON_DEMAND_ASG = getenv("ON_DEMAND_ASG")
	# Name of the Spot AutoScalingGroup that normally serves the load
	SPOT_ASG = getenv("SPOT_ASG")

	# Both names are mandatory: refuse to load the module without them.
	if not (ON_DEMAND_ASG and SPOT_ASG):
	    raise Exception(
	        "Spot and/or On-Demand AutoScalingGroups names are not "
	        "defined"
	    )

	# Shared AutoScaling API client used by every helper in this module
	client = boto3.client('autoscaling', region_name=REGION)


	def main(event, context):
	    """Handle an AWS Lambda invocation.

	    Receives a CloudWatch alarm state change notification (delivered via
	    SNS). If the new state is ``ALARM`` it fails over from Spot to
	    On-Demand (``failover_asg(SPOT_ASG, ON_DEMAND_ASG)``). If the new
	    state is ``OK`` it rolls the failover back to Spot and then sets the
	    On-Demand desired capacity to 0. Any other state is logged and
	    ignored.

	    Parameters
	    ----------
	    event : dict
	        AWS Lambda event. Only the first record is read; its
	        ``["Sns"]["Message"]`` must be a JSON document that contains
	        ``NewStateValue``.
	    context : object
	        AWS Lambda context (unused).

	    Returns
	    -------
	    None

	    """
	    print("Executing Lambda Failover function")
	    alarm = loads(event["Records"][0]["Sns"]["Message"])
	    state = alarm["NewStateValue"]

	    if state == "ALARM":
	        print("'{}' Spots are down, doing a failover to '{}'".
	              format(SPOT_ASG, ON_DEMAND_ASG))
	        failover_asg(SPOT_ASG, ON_DEMAND_ASG)
	    elif state == "OK":
	        print("'{}' Spots are back, rolling back the failover from '{}'".
	              format(SPOT_ASG, ON_DEMAND_ASG))
	        failover_asg(ON_DEMAND_ASG, SPOT_ASG)
	        set_asg_desire(ON_DEMAND_ASG, 0)
	    else:
	        # States such as INSUFFICIENT_DATA trigger no action, so do not
	        # report a completed failover for them.
	        print("Ignoring alarm state '{}', no failover performed".
	              format(state))
	        return
	    print('Failover is done')


	def failover_asg(source_asg, failover_asg):
	    """Switch between two AutoScalingGroups.

	    Useful for failing over from Spot to On-Demand and back: the target
	    group is resized to the source group's desired capacity and receives
	    the source group's scaling policies.

	    Parameters
	    ----------
	    source_asg : string
	        Name of AutoScalingGroup to switch from.
	    failover_asg : string
	        Name of AutoScalingGroup to switch to.

	    Returns
	    -------
	    None

	    """
	    # Look up both groups, keyed by name
	    group_info = describe_asgs([source_asg, failover_asg])
	    # Policies currently attached to the group we are leaving
	    policies_to_move = describe_asg_policies(source_asg)
	    # The target group takes over the source group's size ...
	    wanted_size = group_info[source_asg]["DesiredCapacity"]
	    set_asg_desire(failover_asg, wanted_size)
	    # ... and its scaling policies
	    attach_asg_policies(failover_asg, policies_to_move)


	def describe_asgs(asgs):
	    """Get information about AutoScalingGroups.

	    Can retrieve information about multiple AutoScalingGroups in a single
	    ``describe_auto_scaling_groups`` call.

	    Parameters
	    ----------
	    asgs : list
	        AutoScalingGroups names list. An empty list yields an empty
	        result without calling the API.

	    Returns
	    -------
	    dict
	        Dictionary with AutoScalingGroups information, where each key is
	        an AutoScalingGroup name and each value is the group description
	        returned by the API. Names that the API does not return are
	        absent.

	    """
	    if not asgs:
	        # An empty request would send MaxRecords=0 and an empty name
	        # filter; there is nothing to look up, so answer locally.
	        return {}

	    response = client.describe_auto_scaling_groups(
	        AutoScalingGroupNames=asgs,
	        MaxRecords=len(asgs)
	    )

	    return {
	        group["AutoScalingGroupName"]: group
	        for group in response["AutoScalingGroups"]
	    }


	def set_asg_desire(asg, count):
	    """Set the desired number of instances of an AutoScalingGroup.

	    The request is sent with ``HonorCooldown=False``.

	    Parameters
	    ----------
	    asg : string
	        Name of AutoScalingGroup.
	    count : int
	        New desired number of instances; expected to be no lower than the
	        group's minimum size and no higher than its maximum size.

	    Returns
	    -------
	    None

	    """
	    request = {
	        "AutoScalingGroupName": asg,
	        "DesiredCapacity": count,
	        "HonorCooldown": False,
	    }
	    client.set_desired_capacity(**request)


	def describe_asg_policies(asg):
	    """Fetch the scaling policies of an AutoScalingGroup.

	    Parameters
	    ----------
	    asg : string
	        Name of AutoScalingGroup.

	    Returns
	    -------
	    list
	        The ``ScalingPolicies`` entry of the ``describe_policies``
	        response for the group.

	    """
	    policies_response = client.describe_policies(AutoScalingGroupName=asg)
	    return policies_response["ScalingPolicies"]


	def attach_asg_policies(asg, policies):
	    """Attach scaling policies to an AutoScalingGroup and retarget alarms.

	    Every received policy is put on ``asg`` (per the original contract, a
	    policy that already exists there is updated). Each CloudWatch alarm
	    listed on the source policy is then re-submitted so that it
	    references the new policy ARN instead of the old one.

	    Parameters
	    ----------
	    asg : string
	        Name of AutoScalingGroup that receives the policies.
	    policies : list
	        Scaling policy dicts, as returned by ``describe_asg_policies``.

	    Returns
	    -------
	    None

	    """
	    if not policies:
	        # Nothing to attach, so no CloudWatch client is needed either.
	        return

	    cw_client = boto3.client('cloudwatch',
	                             region_name=REGION)
	    for policy in policies:
	        # Build the new policy from the old one, minus the old policy info
	        new_policy = {key: value for key, value in policy.items()
	                      if key not in ("Alarms", "PolicyARN")}
	        new_policy["AutoScalingGroupName"] = asg

	        response = client.put_scaling_policy(**new_policy)

	        for alarm in policy.get("Alarms", []):
	            described = cw_client.describe_alarms(
	                AlarmNames=[alarm["AlarmName"]]
	            )
	            metric_alarms = described.get("MetricAlarms", [])
	            if not metric_alarms:
	                # Indexing [0] on an empty result would raise IndexError
	                # and abort the remaining policies; skip this alarm.
	                print("Alarm '{}' was not found, skipping it".
	                      format(alarm["AlarmName"]))
	                continue

	            cw_client.put_metric_alarm(
	                **_retarget_alarm(metric_alarms[0], policy["PolicyARN"],
	                                  response["PolicyARN"])
	            )


	def _retarget_alarm(metric_alarm, old_arn, new_arn):
	    """Build put_metric_alarm arguments that point an alarm at a new policy.

	    Parameters
	    ----------
	    metric_alarm : dict
	        One entry of ``MetricAlarms`` from a ``describe_alarms`` response.
	        It is not modified.
	    old_arn : string
	        Policy ARN to remove from the alarm's action lists.
	    new_arn : string
	        Policy ARN to put in its place.

	    Returns
	    -------
	    dict
	        Copy of ``metric_alarm`` without the state/identity fields and
	        with ``old_arn`` replaced by ``new_arn`` in every action list that
	        held it. If no list held it, ``new_arn`` is added to
	        ``AlarmActions``.

	    """
	    # Fields reported by describe_alarms that describe the alarm's identity
	    # or current state rather than its configuration.
	    read_only = (
	        "AlarmArn",
	        "AlarmConfigurationUpdatedTimestamp",
	        "StateValue",
	        "StateReason",
	        "StateReasonData",
	        "StateUpdatedTimestamp",
	        "StateTransitionedTimestamp",
	        "EvaluationState",
	    )
	    params = {key: value for key, value in metric_alarm.items()
	              if key not in read_only}

	    swapped = False
	    for action_key in ("AlarmActions", "OKActions",
	                       "InsufficientDataActions"):
	        actions = params.get(action_key) or []
	        if old_arn not in actions:
	            continue
	        kept = [arn for arn in actions if arn != old_arn]
	        if new_arn not in kept:
	            kept.append(new_arn)
	        params[action_key] = kept
	        swapped = True

	    if not swapped:
	        # The old policy was not referenced anywhere; still make sure the
	        # alarm triggers the new policy, as the caller intends.
	        alarm_actions = list(params.get("AlarmActions") or [])
	        if new_arn not in alarm_actions:
	            alarm_actions.append(new_arn)
	        params["AlarmActions"] = alarm_actions

	    return params