Skip to content

Instantly share code, notes, and snippets.

@krushik
Last active February 26, 2025 17:32
Show Gist options
  • Save krushik/b4f6900391cf2731d189dd6f29e4343d to your computer and use it in GitHub Desktop.
Save krushik/b4f6900391cf2731d189dd6f29e4343d to your computer and use it in GitHub Desktop.
If you have a persistent Spot Fleet running some web apps, this script / systemd service will automatically register your EC2 instance with the required ALB target groups on system boot, then deregister it when a Spot interruption notice arrives. It will also deregister the instance from the ALB on a regular system shutdown.
#!/usr/bin/env python3
# automatic ec2 instance registration with ALB TargetGroups on instance boot
# and deregistration on Spot interruption notice or instance shutdown
import json
import logging
import signal
import sys
import time
from datetime import datetime, timedelta, timezone
import boto3
import requests
from botocore.exceptions import ClientError
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
# Root logging is configured at INFO; this script's own logger is set to DEBUG.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('spot-lifecycle.py')
logger.setLevel(logging.DEBUG)

# How long to wait after boot, so the apps on this instance can start,
# before registering with the ALB target groups.
STARTUP_ALB_REGISTRATION_DELAY = timedelta(seconds=40)

# How much time to keep in reserve between deregistering from the ALB and the
# moment the spot interruption actually shuts the instance down.
ALB_DEREGISTRATION_TIME_BUFFER = timedelta(seconds=45)

# Target groups this instance registers with / deregisters from.
ALB_TARGET_GROUPS = [
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY1/ZZZ1',
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY2/ZZZ2',
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY3/ZZZ3',
]
def _build_imds_retry():
    """Build the urllib3 retry policy used for instance-metadata requests.

    urllib3 1.26 renamed ``method_whitelist`` to ``allowed_methods`` and
    urllib3 2.0 removed the old keyword, so the new name is tried first and
    the old one is used only as a fallback for older installations.
    """
    options = dict(
        total=1000000,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    methods = ["GET", "PUT"]
    try:
        return Retry(allowed_methods=methods, **options)
    except TypeError:
        # Older urllib3 that only knows the pre-1.26 keyword.
        return Retry(method_whitelist=methods, **options)


class SpotLifecycleHandler:
    """Register this EC2 instance with ALB target groups and deregister it
    on a spot interruption notice or on SIGTERM.

    Instance id and region are read from the instance metadata service using
    an IMDSv2 token that is refreshed shortly before it expires.
    """

    # Refresh the IMDSv2 token this many seconds before its TTL runs out.
    TOKEN_REFRESH_MARGIN = 300

    retry_strategy = _build_imds_retry()

    def __init__(self):
        """Set up the HTTP session, fetch instance metadata, create the
        elbv2 client and install the SIGTERM handler.

        Raises:
            requests.exceptions.RequestException: if the metadata service
                cannot be queried successfully.
        """
        self.http = requests.Session()
        self.http.mount('http://', HTTPAdapter(max_retries=self.retry_strategy))
        self.token = None
        self.token_timestamp = None
        self.token_ttl = 21600
        self.refresh_token()
        self.instance_id = self._get_metadata('instance-id')
        self.region = self._get_metadata('placement/region')
        self.elbv2 = boto3.client('elbv2', region_name=self.region)
        self.target_groups = ALB_TARGET_GROUPS
        signal.signal(signal.SIGTERM, self.handle_sigterm)

    def refresh_token(self):
        """Get new IMDSv2 token

        Raises:
            requests.exceptions.RequestException: on connection failure or a
                non-success HTTP status, so that an error body is never
                stored as the token.
        """
        token_response = self.http.put(
            'http://169.254.169.254/latest/api/token',
            headers={'X-aws-ec2-metadata-token-ttl-seconds': str(self.token_ttl)},
            timeout=2
        )
        token_response.raise_for_status()
        self.token = token_response.text
        self.token_timestamp = datetime.now(timezone.utc)
        logger.debug("IMDSv2 metadata token refreshed")

    def _refresh_token_if_needed(self):
        """Refresh the token when it is within TOKEN_REFRESH_MARGIN of expiry."""
        refresh_after = self.token_timestamp + timedelta(
            seconds=self.token_ttl - self.TOKEN_REFRESH_MARGIN
        )
        if datetime.now(timezone.utc) > refresh_after:
            self.refresh_token()

    def _get_metadata(self, path):
        """Get metadata with automatic token refresh

        Args:
            path: path below ``/latest/meta-data/``, e.g. ``'instance-id'``.

        Returns:
            The response body as text.

        Raises:
            requests.exceptions.RequestException: on connection failure or a
                non-success HTTP status, so that an error page is never
                returned as if it were the metadata value.
        """
        self._refresh_token_if_needed()
        response = self.http.get(
            'http://169.254.169.254/latest/meta-data/{0}'.format(path),
            headers={'X-aws-ec2-metadata-token': self.token},
            timeout=2
        )
        response.raise_for_status()
        return response.text

    def register_with_alb(self):
        """Register instance with ALB target groups

        A failure for one target group is logged and does not prevent the
        remaining groups from being processed.
        """
        for target_group in self.target_groups:
            try:
                self.elbv2.register_targets(
                    TargetGroupArn=target_group,
                    Targets=[{'Id': self.instance_id}]
                )
                logger.info("Registered with target group: {0}".format(target_group))
            except ClientError as e:
                logger.error("Failed to register with {0}: {1}".format(target_group, e))

    def deregister_from_alb(self):
        """Deregister instance from ALB target groups

        A failure for one target group is logged and does not prevent the
        remaining groups from being processed.
        """
        for target_group in self.target_groups:
            try:
                self.elbv2.deregister_targets(
                    TargetGroupArn=target_group,
                    Targets=[{'Id': self.instance_id}]
                )
                logger.info("Deregistered from target group: {0}".format(target_group))
            except ClientError as e:
                logger.error("Failed to deregister from {0}: {1}".format(target_group, e))

    def _is_drained(self, target_group):
        """Return True when this instance is reported as 'unused' in *target_group*.

        An API error is logged and treated as "not drained yet".
        """
        try:
            response = self.elbv2.describe_target_health(
                TargetGroupArn=target_group,
                Targets=[{'Id': self.instance_id}]
            )
        except ClientError as e:
            logger.error("Failed to check draining status: {0}".format(e))
            return False
        return all(
            target['TargetHealth']['State'] == 'unused'
            for target in response['TargetHealthDescriptions']
        )

    def wait_for_connection_draining(self, timeout=300):
        """Wait for connection draining. Just for logging purposes

        Polls every 5 seconds until the instance is 'unused' in every target
        group, or until *timeout* seconds have elapsed.

        Args:
            timeout: maximum number of seconds to keep polling.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if all(self._is_drained(target_group) for target_group in self.target_groups):
                logger.info("Connection draining completed")
                return
            time.sleep(5)
        logger.warning("Connection draining timed out")

    def _seconds_until_deregistration(self, notice_body):
        """Return how many seconds to wait before deregistering from the ALB.

        Args:
            notice_body: text body of the ``spot/instance-action`` response.

        Returns:
            Seconds until (interruption time - ALB_DEREGISTRATION_TIME_BUFFER);
            may be zero or negative. If the notice cannot be parsed, 0.0 is
            returned so the instance is deregistered immediately instead of
            the monitor crashing with the instance still registered.
        """
        try:
            interruption_data = json.loads(notice_body)
            stop_time = datetime.strptime(
                interruption_data['time'], '%Y-%m-%dT%H:%M:%SZ'
            ).replace(tzinfo=timezone.utc)
        except (ValueError, KeyError, TypeError) as e:
            logger.error("Failed to parse spot interruption notice, deregistering immediately: {0}".format(e))
            return 0.0
        remaining = stop_time - ALB_DEREGISTRATION_TIME_BUFFER - datetime.now(timezone.utc)
        return remaining.total_seconds()

    def monitor_spot_interruption(self):
        """Monitor spot interruption notice

        Polls the metadata service every 5 seconds. Once a notice is
        returned (HTTP 200), waits until ALB_DEREGISTRATION_TIME_BUFFER
        before the announced time, deregisters from the ALB, waits for
        draining and returns.
        """
        while True:
            try:
                time.sleep(5)
                self._refresh_token_if_needed()
                response = self.http.get(
                    'http://169.254.169.254/latest/meta-data/spot/instance-action',
                    headers={'X-aws-ec2-metadata-token': self.token},
                    timeout=1
                )
                if response.status_code == 401:
                    # The token was rejected; get a new one now rather than
                    # polling blind until the scheduled refresh.
                    logger.warning("IMDSv2 metadata token rejected, refreshing")
                    self.refresh_token()
                    continue
                if response.status_code != 200:
                    # Any other non-200 answer: no interruption notice yet.
                    continue
                wait_time = self._seconds_until_deregistration(response.text)
                if wait_time > 0:
                    logger.info("Waiting {:.1f} seconds before deregistering".format(wait_time))
                    time.sleep(wait_time)
                self.deregister_from_alb()
                self.wait_for_connection_draining()
                break
            except requests.exceptions.RequestException:
                logger.warning("Failed to get spot interruption notice from instance metadata")
            except KeyboardInterrupt:
                # Treat Ctrl-C like a regular shutdown request.
                self.handle_sigterm(signal.SIGTERM, None)

    def handle_sigterm(self, signal_number, frame):
        """Signal handler: deregister from the ALB and exit with status 0.

        Args:
            signal_number: number of the received signal (unused).
            frame: current stack frame passed by the signal module (unused).
        """
        logger.warning("Received SIGTERM signal")
        self.deregister_from_alb()
        sys.exit(0)
if __name__ == '__main__':
    # Creating the handler fetches instance metadata and installs the
    # SIGTERM handler before anything else happens.
    lifecycle = SpotLifecycleHandler()

    # Wait for the configured startup delay before registering.
    startup_delay_seconds = STARTUP_ALB_REGISTRATION_DELAY.total_seconds()
    time.sleep(startup_delay_seconds)

    lifecycle.register_with_alb()
    # Runs until a spot interruption notice has been handled.
    lifecycle.monitor_spot_interruption()
# systemd unit that runs the spot-lifecycle.py script as a service.
[Unit]
Description=Spot Instance Lifecycle Handler
# Order this unit after network.target.
After=network.target
[Service]
# The script registers the instance with the ALB target groups after its
# startup delay and deregisters it when it receives SIGTERM.
ExecStart=/usr/local/bin/spot-lifecycle.py
# you can change this to DynamicUser=yes if you have python >= 3.6
# NOTE(review): DynamicUser= is a systemd setting, so its availability
# presumably depends on the systemd version rather than on python — confirm.
User=nobody
Group=nogroup
[Install]
# Pulled in by multi-user.target when the unit is enabled.
WantedBy=multi-user.target
@krushik
Copy link
Author

krushik commented Feb 24, 2025

You will need to attach an instance profile and role to these instances that includes a policy with permissions to RegisterTargets, DeregisterTargets, and DescribeTargetHealth for your ALB target groups. Here's an example of such a policy:

# ec2-alb-auto-registration iam policy for app YYY
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "elasticloadbalancing:RegisterTargets",
                "elasticloadbalancing:DeregisterTargets"
            ],
            "Resource": [
                "arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY1/ZZZ1",
                "arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY2/ZZZ2",
                "arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY3/ZZZ3"
            ]
        },
        {
            "Sid": "VisualEditor1",
            "Effect": "Allow",
            "Action": "elasticloadbalancing:DescribeTargetHealth",
            "Resource": "*"
        }
    ]
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment