Last active
February 20, 2022 23:24
-
-
Save kevinmehall/f3e53a0be9d59f0759d8fbba872bc8cf to your computer and use it in GitHub Desktop.
Script to drain and replace EC2 instances in an ECS cluster auto-scaling group after changing the AMI or instance type
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# Script to replace EC2 instances in an ECS cluster's auto-scaling group after | |
# changing the AMI or instance type in the launch template. It | |
# checks for instances with the incorrect AMI or type, scales up the | |
# auto-scaling group with replacement instances, then drains the tasks | |
# from the old instances. | |
# | |
# Usage: aws-vault exec profile-name -- python3 replace_ecs_cluster_instances.py --group=asg-name --cluster=ecs-cluster-name --count=3 | |
# | |
# The count is specified so that it knows what the "real" desired count is in | |
# case it is interrupted and restarted after increasing the desired count. | |
# | |
# License: ISC | |
# Copyright 2019 3D Robotics | |
# Permission to use, copy, modify, and/or distribute this software for any | |
# purpose with or without fee is hereby granted, provided that the above | |
# copyright notice and this permission notice appear in all copies. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | |
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER | |
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, | |
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE | |
# USE OR PERFORMANCE OF THIS SOFTWARE. | |
# | |
import boto3 | |
import argparse | |
import time | |
# Parse the command line before creating any AWS clients, so that `--help`
# and argument errors are reported without touching AWS configuration.
parser = argparse.ArgumentParser(description='Update the instances in the autoscaling group.')
parser.add_argument('--group', metavar='NAME', required=True, help='Autoscaling group name')
parser.add_argument('--cluster', metavar='NAME', required=True, help='ECS cluster name')
# This option is an integer instance count, so show it as N in the usage text
# (it was previously labelled ARN).
parser.add_argument('--count', metavar='N', type=int, required=True, help='Desired count of instances in autoscaling group')
args = parser.parse_args()

group_name = args.group
cluster_name = args.cluster
# The "real" steady-state size of the group, as opposed to the temporarily
# inflated desired capacity used while replacement instances boot.
desired_count = args.count

# AWS service clients used by the rest of the script.
autoscaling = boto3.client('autoscaling')
ec2 = boto3.client('ec2')
ecs = boto3.client('ecs')
# Look up the auto-scaling group. Check that a group actually came back before
# indexing, and use explicit errors instead of `assert` (which is stripped
# when Python runs with -O).
matching_groups = autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[group_name])['AutoScalingGroups']
if not matching_groups:
    raise SystemExit(f"Auto-scaling group {group_name!r} not found")
describe_group = matching_groups[0]
if describe_group['AutoScalingGroupName'] != group_name:
    raise SystemExit(
        f"Expected auto-scaling group {group_name!r} but got {describe_group['AutoScalingGroupName']!r}"
    )

desired_capacity = describe_group['DesiredCapacity']
# Capacity the group had when the script started; the scale-up step never
# goes below this.
prev_desired_count = desired_capacity
asg_instances = [instance['InstanceId'] for instance in describe_group['Instances']]

# The script compares instances against the group's launch template, so a
# group that does not reference one directly cannot be processed.
target_launch_template = describe_group.get('LaunchTemplate')
if target_launch_template is None:
    raise SystemExit(
        f"Auto-scaling group {group_name!r} has no launch template; "
        "this script only supports groups configured with a launch template"
    )
print(f"Target launch template {target_launch_template['LaunchTemplateId']} {target_launch_template['Version']}")

# Resolve the template version the group points at to a concrete AMI and
# instance type.
describe_launch_template = ec2.describe_launch_template_versions(
    LaunchTemplateId=target_launch_template['LaunchTemplateId'],
    Versions=[target_launch_template['Version']],
)['LaunchTemplateVersions'][0]
target_ami = describe_launch_template['LaunchTemplateData']['ImageId']
target_instance_type = describe_launch_template['LaunchTemplateData']['InstanceType']
print(f"Target AMI {target_ami} on {target_instance_type}")
# Compare every instance currently in the group against the target AMI and
# instance type, collecting the IDs of those that need to be replaced.
#
# Skip the EC2 call entirely when the group has no instances: an empty
# InstanceIds list does not act as a filter, so the call would describe
# instances unrelated to this group and mark them for replacement.
if asg_instances:
    describe_instances_reservations = ec2.describe_instances(InstanceIds=asg_instances)['Reservations']
else:
    describe_instances_reservations = []

instances_to_replace = []
for reservation in describe_instances_reservations:
    for instance in reservation['Instances']:
        instance_id = instance['InstanceId']
        instance_ami = instance['ImageId']
        instance_type = instance['InstanceType']
        instance_launched = instance['LaunchTime']
        # An instance is outdated if either its AMI or its type differs.
        needs_replace = instance_ami != target_ami or instance_type != target_instance_type
        if needs_replace:
            instances_to_replace.append(instance_id)
        print(f"Instance {instance_id}, created {instance_launched.ctime()}, type {instance_type}, AMI {instance_ami} -- {'REPLACE' if needs_replace else 'OK'}")
# Scale the group up so that there is one replacement for every outdated
# instance on top of the real desired count. Never scale below the current
# capacity, which may already be inflated by an earlier interrupted run.
new_desired_count = max(prev_desired_count, desired_count + len(instances_to_replace))

# Fail with an actionable message before changing anything if the temporary
# capacity does not fit within the group's configured maximum size.
max_size = describe_group['MaxSize']
if new_desired_count > max_size:
    raise SystemExit(
        f"Temporary desired capacity {new_desired_count} exceeds the maximum size {max_size} "
        f"of auto-scaling group {group_name}; raise MaxSize to at least {new_desired_count} and re-run"
    )

print(f"Temporarily scaling cluster from {prev_desired_count} to {new_desired_count} instances")
autoscaling.set_desired_capacity(AutoScalingGroupName=group_name, DesiredCapacity=new_desired_count)
# Poll the ECS cluster until enough replacement instances are active and all
# outdated instances have been drained of tasks.
#
# Per-call limits of the ECS API: describe_container_instances accepts at most
# 100 container instances and update_container_instances_state at most 10.
DESCRIBE_BATCH_SIZE = 100
DRAIN_BATCH_SIZE = 10

# Set for O(1) membership tests inside the polling loop.
replace_ids = set(instances_to_replace)
list_paginator = ecs.get_paginator('list_container_instances')

while True:
    print('\n----\n')

    # Collect every container instance ARN, following pagination.
    list_container_instances = [
        arn
        for page in list_paginator.paginate(cluster=cluster_name)
        for arn in page['containerInstanceArns']
    ]

    # Describe in batches. With no registered instances the loop body never
    # runs, which avoids calling the API with an empty list.
    container_instances = []
    for start in range(0, len(list_container_instances), DESCRIBE_BATCH_SIZE):
        batch = list_container_instances[start:start + DESCRIBE_BATCH_SIZE]
        container_instances.extend(
            ecs.describe_container_instances(cluster=cluster_name, containerInstances=batch)['containerInstances']
        )
    container_instances.sort(key=lambda ci: ci['registeredAt'])

    available_instances = 0  # ACTIVE instances that are not being replaced
    remaining_tasks = 0      # tasks still running on outdated instances
    to_drain = []            # outdated instances not yet set to DRAINING

    for ci in container_instances:
        ci_ec2_id = ci['ec2InstanceId']
        ci_arn = ci['containerInstanceArn']
        running_tasks = ci['runningTasksCount']
        status = ci['status']
        print(f"{ci_ec2_id} {status}, {running_tasks} tasks")
        if ci_ec2_id in replace_ids:
            remaining_tasks += running_tasks
            if status == 'ACTIVE':
                to_drain.append(ci_arn)
        elif status == 'ACTIVE':
            available_instances += 1

    if available_instances < desired_count:
        # Do not start draining until the replacements can take the load.
        print("Waiting for new instances to boot")
    elif to_drain:
        print("Draining instances:", to_drain)
        for start in range(0, len(to_drain), DRAIN_BATCH_SIZE):
            ecs.update_container_instances_state(
                cluster=cluster_name,
                containerInstances=to_drain[start:start + DRAIN_BATCH_SIZE],
                status='DRAINING',
            )
    elif remaining_tasks == 0:
        # Nothing left to drain and no tasks on outdated instances.
        break
    else:
        print("Waiting for instances to drain")

    time.sleep(10)
# Ask for confirmation before terminating each drained instance. Terminating
# with ShouldDecrementDesiredCapacity=True also lowers the group's desired
# capacity, undoing the temporary scale-up one instance at a time.
for instance_id in instances_to_replace:
    # Normalise the reply so that "Y" or " y " counts as confirmation rather
    # than silently skipping the instance.
    answer = input(f"Terminate instance {instance_id}? (y/n) ").strip().lower()
    if answer == "y":
        autoscaling.terminate_instance_in_auto_scaling_group(InstanceId=instance_id, ShouldDecrementDesiredCapacity=True)
        print("Terminated instance")
    else:
        print("Not terminating this instance")

print("Done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@kevinmehall Thanks for sharing the script. I had a case where the temporary
new_desired_count
exceeded the maximum capacity for the autoscaling group. I fixed that by adding a call that temporarily raises the group's max size after line 82. Then I added
autoscaling.update_auto_scaling_group(AutoScalingGroupName=group_name, MaxSize=max_capacity)
to bring the autoscaling max size back to normal at the end.