Last active
February 26, 2025 17:32
-
-
Save krushik/b4f6900391cf2731d189dd6f29e4343d to your computer and use it in GitHub Desktop.
If you have a persistent Spot Fleet running web apps, this script / systemd service automatically registers your EC2 instance with the required ALB target groups on system boot, then deregisters it when a Spot interruption notice arrives. It also deregisters the instance from the ALB on a regular system shutdown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# automatic ec2 instance registration with ALB TargetGroups on instance boot
# and deregistration on Spot interruption notice or instance shutdown | |
import json
import logging
import signal
import sys
import time
from datetime import datetime, timedelta, timezone

import boto3
import requests
from botocore.exceptions import BotoCoreError, ClientError
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
# Root logging stays at INFO; only this script's own logger is raised to DEBUG.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('spot-lifecycle.py')
logger.setLevel(logging.DEBUG)

# How long to wait after boot, so that our apps can start, before registering with the ALB.
STARTUP_ALB_REGISTRATION_DELAY = timedelta(seconds=40)

# How long before the announced spot interruption time the instance is deregistered from the ALB.
ALB_DEREGISTRATION_TIME_BUFFER = timedelta(seconds=45)

# ARNs of the ALB target groups this instance is registered with / deregistered from.
ALB_TARGET_GROUPS = [
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY1/ZZZ1',
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY2/ZZZ2',
    'arn:aws:elasticloadbalancing:eu-west-1:XXX:targetgroup/YYY3/ZZZ3',
]
class SpotLifecycleHandler:
    """Register this EC2 instance with ALB target groups and deregister it again.

    On construction the handler obtains an IMDSv2 token, reads the instance id
    and region from the instance metadata service, creates an ``elbv2`` client
    and installs a SIGTERM handler. Deregistration happens either when a spot
    interruption notice shows up in the instance metadata or when the process
    receives SIGTERM.
    """

    _IMDS_BASE_URL = 'http://169.254.169.254/latest'
    # Seconds before token expiry at which a new IMDSv2 token is requested.
    _TOKEN_REFRESH_MARGIN = 300

    _retry_options = {
        'total': 1000000,
        'backoff_factor': 0.5,
        'status_forcelist': [429, 500, 502, 503, 504],
    }
    try:
        # ``allowed_methods`` is the urllib3 >= 1.26 spelling; the legacy
        # ``method_whitelist`` keyword was removed in urllib3 2.0.
        retry_strategy = Retry(allowed_methods=["GET", "PUT"], **_retry_options)
    except TypeError:
        # Older urllib3 releases only accept the legacy keyword.
        retry_strategy = Retry(method_whitelist=["GET", "PUT"], **_retry_options)

    def __init__(self):
        """Set up the HTTP session, fetch instance identity and hook SIGTERM.

        Raises:
            requests.exceptions.RequestException: if the metadata service
                cannot be reached or answers with an error status.
        """
        self.http = requests.Session()
        self.http.mount('http://', HTTPAdapter(max_retries=self.retry_strategy))

        self.token = None
        self.token_timestamp = None
        self.token_ttl = 21600
        self.refresh_token()

        self.instance_id = self._get_metadata('instance-id')
        self.region = self._get_metadata('placement/region')
        self.elbv2 = boto3.client('elbv2', region_name=self.region)
        self.target_groups = ALB_TARGET_GROUPS

        signal.signal(signal.SIGTERM, self.handle_sigterm)

    def refresh_token(self):
        """Get a new IMDSv2 token and remember when it was issued.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the token endpoint answers with an error status.
        """
        token_response = self.http.put(
            self._IMDS_BASE_URL + '/api/token',
            headers={'X-aws-ec2-metadata-token-ttl-seconds': str(self.token_ttl)},
            timeout=2
        )
        # Without this check an error page would silently be used as the token.
        token_response.raise_for_status()
        self.token = token_response.text
        self.token_timestamp = datetime.now(timezone.utc)
        logger.debug("IMDSv2 metadata token refreshed")

    def _ensure_fresh_token(self):
        """Refresh the IMDSv2 token when it is within the refresh margin of expiry."""
        refresh_after = self.token_timestamp + timedelta(
            seconds=self.token_ttl - self._TOKEN_REFRESH_MARGIN
        )
        if datetime.now(timezone.utc) > refresh_after:
            self.refresh_token()

    def _get_metadata(self, path):
        """Return the text of ``/latest/meta-data/<path>``, refreshing the token if needed.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the metadata service answers with an error status.
        """
        self._ensure_fresh_token()
        response = self.http.get(
            '{0}/meta-data/{1}'.format(self._IMDS_BASE_URL, path),
            headers={'X-aws-ec2-metadata-token': self.token},
            timeout=2
        )
        # Fail loudly instead of treating an error body as metadata.
        response.raise_for_status()
        return response.text

    def register_with_alb(self):
        """Register the instance with every configured ALB target group.

        Failures are logged per target group and do not stop the remaining
        registrations.
        """
        for target_group in self.target_groups:
            try:
                self.elbv2.register_targets(
                    TargetGroupArn=target_group,
                    Targets=[{'Id': self.instance_id}]
                )
            except (BotoCoreError, ClientError) as e:
                logger.error("Failed to register with %s: %s", target_group, e)
            else:
                logger.info("Registered with target group: %s", target_group)

    def deregister_from_alb(self):
        """Deregister the instance from every configured ALB target group.

        Failures are logged per target group and do not stop the remaining
        deregistrations.
        """
        for target_group in self.target_groups:
            try:
                self.elbv2.deregister_targets(
                    TargetGroupArn=target_group,
                    Targets=[{'Id': self.instance_id}]
                )
            except (BotoCoreError, ClientError) as e:
                logger.error("Failed to deregister from %s: %s", target_group, e)
            else:
                logger.info("Deregistered from target group: %s", target_group)

    def _is_drained(self, target_group):
        """Return True when the instance is reported as 'unused' in ``target_group``."""
        try:
            response = self.elbv2.describe_target_health(
                TargetGroupArn=target_group,
                Targets=[{'Id': self.instance_id}]
            )
        except (BotoCoreError, ClientError) as e:
            logger.error("Failed to check draining status: %s", e)
            return False
        return all(
            target['TargetHealth']['State'] == 'unused'
            for target in response['TargetHealthDescriptions']
        )

    def wait_for_connection_draining(self, timeout=300):
        """Wait for connection draining. Just for logging purposes.

        Polls the target health every 5 seconds until the instance is
        'unused' in all target groups or ``timeout`` seconds have elapsed.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if all(self._is_drained(target_group) for target_group in self.target_groups):
                logger.info("Connection draining completed")
                return
            time.sleep(5)
        logger.warning("Connection draining timed out")

    @staticmethod
    def _seconds_until_deregistration(notice_text, now=None):
        """Return seconds from ``now`` until deregistration should start.

        ``notice_text`` is the JSON body of the spot instance-action metadata
        document; its ``time`` field minus ALB_DEREGISTRATION_TIME_BUFFER is
        the moment to deregister. The result may be negative.

        Raises:
            ValueError, KeyError, TypeError: if the notice cannot be parsed.
        """
        notice = json.loads(notice_text)
        stop_time = datetime.strptime(
            notice['time'], '%Y-%m-%dT%H:%M:%SZ'
        ).replace(tzinfo=timezone.utc)
        if now is None:
            now = datetime.now(timezone.utc)
        return (stop_time - ALB_DEREGISTRATION_TIME_BUFFER - now).total_seconds()

    def monitor_spot_interruption(self):
        """Poll the metadata service for a spot interruption notice.

        Checks every 5 seconds. Once a notice is present, waits until
        ALB_DEREGISTRATION_TIME_BUFFER before the announced time, deregisters
        from the ALB, waits for draining and returns.
        """
        while True:
            try:
                time.sleep(5)
                self._ensure_fresh_token()
                response = self.http.get(
                    self._IMDS_BASE_URL + '/meta-data/spot/instance-action',
                    headers={'X-aws-ec2-metadata-token': self.token},
                    timeout=1
                )
                if response.status_code == 401:
                    # The token was rejected; get a new one and poll again.
                    self.refresh_token()
                    continue
                if response.status_code != 200:
                    # No interruption notice yet.
                    continue

                try:
                    wait_time = self._seconds_until_deregistration(response.text)
                except (ValueError, KeyError, TypeError) as e:
                    # A notice exists but is unreadable: deregister right away
                    # rather than crash and leave the instance registered.
                    logger.error(
                        "Could not parse spot interruption notice (%s); "
                        "deregistering immediately", e
                    )
                    wait_time = 0

                if wait_time > 0:
                    logger.info("Waiting %.1f seconds before deregistering", wait_time)
                    time.sleep(wait_time)
                self.deregister_from_alb()
                self.wait_for_connection_draining()
                break
            except requests.exceptions.RequestException:
                logger.warning("Failed to get spot interruption notice from instance metadata")
            except KeyboardInterrupt:
                self.handle_sigterm(signal.SIGTERM, None)

    def handle_sigterm(self, signal_number, frame):
        """Signal handler: deregister from the ALB and exit with status 0.

        Raises:
            SystemExit: always, after the deregistration attempt.
        """
        logger.warning("Received SIGTERM signal")
        self.deregister_from_alb()
        sys.exit(0)
if __name__ == '__main__':
    lifecycle = SpotLifecycleHandler()

    # Give the apps on this instance time to start before joining the ALB.
    startup_delay = STARTUP_ALB_REGISTRATION_DELAY.total_seconds()
    time.sleep(startup_delay)

    lifecycle.register_with_alb()
    lifecycle.monitor_spot_interruption()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[Unit]
Description=Spot Instance Lifecycle Handler
# The script talks to the instance metadata service and the AWS API, so start
# it only once the network is actually usable, not merely being set up.
Wants=network-online.target
After=network-online.target

[Service]
ExecStart=/usr/local/bin/spot-lifecycle.py
# you can change this to DynamicUser=yes if you have python >= 3.6
User=nobody
Group=nogroup

[Install]
WantedBy=multi-user.target
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You will need to attach an instance profile and role to these instances that includes a policy with permissions to RegisterTargets, DeregisterTargets, and DescribeTargetHealth for your ALB target groups. Here's an example of such a policy: