Created
July 24, 2025 23:40
-
-
Save circa10a/0fa138d115126c4ef0d5b478eb79838d to your computer and use it in GitHub Desktop.
Automatically run `terraform untaint` on AWS Auto Scaling groups once their desired instance capacity is fulfilled
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -eo pipefail

# Checks the Terraform state for tainted resources and, when their desired
# capacity is met, untaints them automatically.
# Tainted resources exist when CSPs can't fully provision resources (like ASGs)
# due to capacity constraints.
# This eliminates manual untaints, since our build processes halt destruction
# of these resources so we don't lose acquired capacity.
# Supports AWS standard + GovCloud.
#
# Usage: <script> <csp>   (only "aws" is handled)

# Cloud service provider name, taken from the first positional argument.
CSP="${1:-}"

# Scratch file that receives the pulled Terraform state.
STATE_JSON=$(mktemp)
# Terraform state may hold sensitive values, so never leave the copy behind,
# whichever way the script exits.
trap 'rm -f -- "$STATE_JSON"' EXIT

# Role assumed for the AWS capacity lookups (placeholder; set to a real ARN).
IAM_ROLE_ARN="<arn>"

# Raw JSON response of `aws sts assume-role`; filled in by assume_role.
AWS_CREDS=""
# Print one log line to stdout as "<script name>: [<level>] <message>".
# Arguments:
#   $1 - severity label (the script uses INFO, WARN and ERROR)
#   $2 - message text
log() {
  local level="$1"
  local msg="$2"
  # ${0##*/} strips the directory part without forking basename and is safe
  # for script paths that contain spaces.
  printf '%s: [%s] %s\n' "${0##*/}" "$level" "$msg"
}
# Abort early when no CSP name was supplied on the command line.
if [[ -z "$CSP" ]]; then
  log "ERROR" "One argument of CSP name expected"
  exit 1
fi
# Pull the current Terraform state and write it to the file named by
# STATE_JSON, replacing any previous contents.
# Globals: STATE_JSON (read; must be a non-empty path)
get_state() {
  terraform state pull >"${STATE_JSON:?STATE_JSON must be set}"
}
# Report whether the state file contains at least one tainted
# aws_autoscaling_group instance.
# Globals: STATE_JSON (read)
# Returns: 0 when a tainted instance exists, non-zero otherwise (jq -e exit
#          status; nothing is printed).
has_tainted_asgs() {
  # `[]?` keeps jq quiet when a state has no "resources"/"instances" key,
  # e.g. an empty or freshly initialised state.
  jq -e '
    .resources[]?
    | select(.type == "aws_autoscaling_group")
    | .instances[]?
    | select(.status == "tainted")
  ' "$STATE_JSON" >/dev/null
}
# Assume the configured IAM role via STS and export the temporary
# credentials for the AWS CLI calls that follow.
# Globals:
#   role_arn     (read, optional) - overrides IAM_ROLE_ARN when set
#   IAM_ROLE_ARN (read)           - role to assume
#   AWS_CREDS    (written)        - raw JSON response of `aws sts assume-role`
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN (exported)
# Returns: 0 on success, non-zero when no ARN is configured, the STS call
#          fails, or the response lacks a credential field.
assume_role() {
  # The script only ever defines IAM_ROLE_ARN; honour role_arn if a caller
  # set it, otherwise fall back to the configured constant.
  local arn="${role_arn:-${IAM_ROLE_ARN:-}}"
  local partition sts_region

  if [[ -z "$arn" ]]; then
    log "ERROR" "No IAM role ARN configured"
    return 1
  fi

  # The partition is the second ':'-separated ARN field:
  #   arn:aws-us-gov:iam::<account>:role/Name
  IFS=: read -r _ partition _ <<<"$arn"
  if [[ "$partition" == "aws-us-gov" ]]; then
    sts_region="us-gov-west-1"
  else
    sts_region="us-east-1"
  fi

  log "INFO" "Assuming role: '${arn}' using STS region '${sts_region}'"

  AWS_CREDS=$(aws sts assume-role \
    --region "$sts_region" \
    --role-arn "$arn" \
    --role-session-name untaint-session) || return

  # Assign first, export afterwards: `export VAR=$(cmd)` would hide a jq
  # failure. -e makes jq fail when a field is missing (null).
  AWS_ACCESS_KEY_ID=$(jq -er '.Credentials.AccessKeyId' <<<"$AWS_CREDS") || return
  AWS_SECRET_ACCESS_KEY=$(jq -er '.Credentials.SecretAccessKey' <<<"$AWS_CREDS") || return
  AWS_SESSION_TOKEN=$(jq -er '.Credentials.SessionToken' <<<"$AWS_CREDS") || return
  export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
}
# For every tainted aws_autoscaling_group instance in the state, compare the
# group's desired capacity with its number of InService instances and, when
# they match, run `terraform untaint` on its state address.
# Globals: STATE_JSON (read)
# Exits the script with status 1 at the first ASG whose capacity is not met
# (or cannot be determined).
handle_aws() {
  local fallback_region asg asg_name region state_address
  local capacity desired inservice
  local -a tainted_asgs=()

  # arn can sometimes be null in state; if so, fall back to the region output.
  fallback_region=$(jq -r '.outputs.region.value? // empty' "$STATE_JSON")

  # One compact JSON object per tainted ASG instance: {id, region, address}.
  readarray -t tainted_asgs < <(
    jq -c --arg fallback "$fallback_region" '
      .resources[]
      | select(.type == "aws_autoscaling_group") as $r
      | .instances[]
      | select(.status == "tainted")
      | {
          id: .attributes.id,
          region: (if .attributes.arn != null
                   then (.attributes.arn | split(":")[3])
                   else $fallback
                   end),
          address: (
            (if $r.module then $r.module + "." else "" end)
            + $r.type + "." + $r.name
            # Only count/for_each instances carry an index; appending
            # "[null]" for the others would yield an invalid address.
            + (if .index_key != null
               then "[" + (.index_key | tojson) + "]"
               else ""
               end)
          )
        }
    ' "$STATE_JSON"
  )

  for asg in "${tainted_asgs[@]}"; do
    asg_name=$(jq -r '.id' <<<"$asg")
    region=$(jq -r '.region' <<<"$asg")
    state_address=$(jq -r '.address' <<<"$asg")

    log "INFO" "Checking capacity for ASG '${asg_name}' in region '${region}'. State address is '${state_address}'"

    capacity=$(aws autoscaling describe-auto-scaling-groups \
      --auto-scaling-group-names "$asg_name" \
      --region "$region" \
      --query 'AutoScalingGroups[0].{Desired:DesiredCapacity,InService: length(Instances[?LifecycleState==`InService`])}' \
      --output json)
    desired=$(jq -r '.Desired' <<<"$capacity")
    inservice=$(jq -r '.InService' <<<"$capacity")

    # Require real integers first: a missing ASG yields "null", which would
    # otherwise make the numeric comparison itself error out.
    if [[ "$desired" =~ ^[0-9]+$ && "$inservice" =~ ^[0-9]+$ ]] \
      && [ "$desired" -eq "$inservice" ]; then
      log "INFO" "Desired capacity met: $inservice/$desired"
      # Run terraform without the assumed-role creds so it uses the
      # EC2/Jenkins identity; the subshell leaves this shell's exported
      # credentials intact for the next iteration.
      (
        unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
        terraform untaint "$state_address"
      )
    else
      log "ERROR" "Desired capacity not met: $inservice/$desired"
      exit 1
    fi
  done
}
# Entry point: dispatch on the requested cloud service provider.
case "$CSP" in
  aws)
    get_state
    log "INFO" "Checking for tainted ASGs"
    if has_tainted_asgs; then
      log "INFO" "Tainted ASGs found; assuming role and processing..."
      assume_role
      handle_aws
    else
      log "INFO" "No tainted ASGs found. Nothing to do."
    fi
    ;;
  *)
    # Unknown providers are reported but exit with status 0.
    log "WARN" "CSP '${CSP}' not supported"
    exit 0
    ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment