Skip to content

Instantly share code, notes, and snippets.

@circa10a
Created July 24, 2025 23:40
Show Gist options
  • Save circa10a/0fa138d115126c4ef0d5b478eb79838d to your computer and use it in GitHub Desktop.
Save circa10a/0fa138d115126c4ef0d5b478eb79838d to your computer and use it in GitHub Desktop.
Automatically terraform untaint AWS autoscaling groups if instance capacity is fulfilled
#!/usr/bin/env bash
set -eo pipefail
# Checks the Terraform state for tainted resources and, when the desired
# capacity has been met, automatically untaints them.
# Tainted resources exist when CSPs can't fully provision resources (like ASGs)
# due to capacity constraints.
# This eliminates manual untaints, since our build processes halt destruction
# of these resources so we don't lose acquired capacity.
# Supports AWS standard + govcloud in case you were wondering.
#
# Usage: <script> <csp-name>    (only "aws" is currently handled)

# Cloud provider name; defaults to empty so the argument check can report it.
CSP="${1:-}"
# Scratch file that receives the output of `terraform state pull`.
STATE_JSON=$(mktemp)
# The pulled state is only needed for the lifetime of this script and may hold
# sensitive values, so remove it on every exit path.
trap 'rm -f -- "$STATE_JSON"' EXIT
# Role assumed for the AWS capacity checks (placeholder, replace before use).
IAM_ROLE_ARN="<arn>"
# Raw JSON response of `aws sts assume-role`, filled in by assume_role.
AWS_CREDS=""
#######################################
# Print a log line prefixed with the script name and a severity level.
# Arguments: $1 - level (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Outputs:   "<script>: [<level>] <message>" on stdout
#######################################
log() {
  # Keep these local so logging never clobbers same-named caller variables.
  local level="$1"
  local msg="$2"
  # ${0##*/} strips the directory part without spawning basename and is safe
  # for script paths that contain spaces.
  printf '%s\n' "${0##*/}: [${level}] ${msg}"
}
# Abort early when the caller did not name a cloud provider.
[[ -n "${CSP}" ]] || {
  log "ERROR" "One argument of CSP name expected"
  exit 1
}
#######################################
# Snapshot the current Terraform state into the scratch file.
# Globals: STATE_JSON (read) - path that receives the pulled state
#######################################
get_state() {
  local destination="${STATE_JSON}"
  terraform state pull >"${destination}"
}
#######################################
# Report whether the pulled state holds any tainted autoscaling group instance.
# Globals: STATE_JSON (read) - path of the pulled state
# Returns: 0 when at least one tainted ASG instance exists, non-zero otherwise
#######################################
has_tainted_asgs() {
  # Emits every tainted ASG instance; `jq -e` turns "no output" into a
  # non-zero exit status, which is all the caller looks at.
  local tainted_filter='
    .resources[]
    | select(.type == "aws_autoscaling_group")
    | .instances[]
    | select(.status == "tainted")
  '
  jq -e "${tainted_filter}" "${STATE_JSON}" >/dev/null
}
#######################################
# Assume the configured IAM role and export its temporary credentials.
# Globals:   IAM_ROLE_ARN (read)  - default role to assume
#            role_arn (read)      - honoured if already set, for compatibility
#            AWS_CREDS (written)  - raw JSON returned by `aws sts assume-role`
#            AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN (exported)
# Arguments: $1 - optional role ARN overriding the globals
# Returns:   0 on success, 1 when no ARN is available, the STS call fails or
#            the response lacks credentials
#######################################
assume_role() {
  # The ARN was previously read from a variable nothing ever assigned; resolve
  # it from the argument, an existing role_arn, or the configured IAM_ROLE_ARN.
  local arn="${1:-${role_arn:-${IAM_ROLE_ARN:-}}}"
  local partition sts_region key_id secret_key session_token

  if [[ -z "$arn" ]]; then
    log "ERROR" "No IAM role ARN configured"
    return 1
  fi

  # Pick the STS region from the ARN partition: arn:aws-us-gov:iam::<account>:role/Name
  IFS=':' read -r _ partition _ <<<"$arn"
  if [[ "$partition" == "aws-us-gov" ]]; then
    sts_region="us-gov-west-1"
  else
    sts_region="us-east-1"
  fi

  log "INFO" "Assuming role: '${arn}' using STS region '${sts_region}'"
  # Force JSON so the jq parsing below works whatever the CLI default output is.
  AWS_CREDS=$(aws sts assume-role \
    --region "$sts_region" \
    --role-arn "$arn" \
    --role-session-name untaint-session \
    --output json) || {
    log "ERROR" "Failed to assume role '${arn}'"
    return 1
  }

  # Assign before exporting so a jq failure is not masked by `export`;
  # -e makes a null/missing field a failure instead of the string "null".
  key_id=$(jq -er '.Credentials.AccessKeyId' <<<"$AWS_CREDS") &&
    secret_key=$(jq -er '.Credentials.SecretAccessKey' <<<"$AWS_CREDS") &&
    session_token=$(jq -er '.Credentials.SessionToken' <<<"$AWS_CREDS") || {
    log "ERROR" "assume-role response did not contain usable credentials"
    return 1
  }

  export AWS_ACCESS_KEY_ID="$key_id"
  export AWS_SECRET_ACCESS_KEY="$secret_key"
  export AWS_SESSION_TOKEN="$session_token"
}
#######################################
# Untaint every tainted AWS autoscaling group whose capacity is fulfilled.
# For each tainted ASG in the pulled state, compare the desired capacity with
# the number of InService instances and run `terraform untaint` when they match.
# Globals:   STATE_JSON (read) - path of the pulled state
#            AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN
#            (hidden from terraform only, left untouched for the aws CLI)
# Returns:   0 when all tainted ASGs were untainted; 1 if the state cannot be
#            parsed
# Exits:     1 on the first ASG whose capacity is unmet or cannot be determined
#            (fail-fast, later ASGs are not processed)
#######################################
handle_aws() {
  local fallback_region asg_lines asg asg_name region state_address
  local capacity desired inservice
  local -a tainted_asgs=()

  # arn can sometimes be null in state; if so, fall back to the region in the outputs.
  fallback_region=$(jq -r '.outputs.region.value? // empty' "$STATE_JSON") || {
    log "ERROR" "Unable to read outputs from state"
    return 1
  }

  # One compact JSON object per tainted ASG instance. Captured with $(...) so a
  # jq failure is detected instead of being lost in a process substitution.
  asg_lines=$(
    jq -c --arg fallback "$fallback_region" '
      .resources[]
      | select(.type == "aws_autoscaling_group") as $r
      | .instances[]
      | select(.status == "tainted")
      | {
          id: .attributes.id,
          region: (
            ((.attributes.arn // "") | split(":") | .[3] // "")
            | if . == "" then $fallback else . end
          ),
          address: (
            (if $r.module then $r.module + "." else "" end)
            + $r.type + "." + $r.name
            + (if .index_key == null
                 then ""
                 else "[" + (.index_key | tojson) + "]"
               end)
          )
        }
    ' "$STATE_JSON"
  ) || {
    log "ERROR" "Unable to extract tainted ASGs from state"
    return 1
  }

  # A here-string of "" would yield one empty element, so only split real output.
  if [[ -n "$asg_lines" ]]; then
    readarray -t tainted_asgs <<<"$asg_lines"
  fi

  for asg in "${tainted_asgs[@]}"; do
    asg_name=$(jq -r '.id' <<<"$asg")
    region=$(jq -r '.region' <<<"$asg")
    state_address=$(jq -r '.address' <<<"$asg")

    if [[ -z "$asg_name" || "$asg_name" == "null" ]]; then
      log "ERROR" "Tainted ASG at '${state_address}' has no id in state"
      exit 1
    fi
    if [[ -z "$region" || "$region" == "null" ]]; then
      log "ERROR" "Unable to determine region for ASG '${asg_name}'"
      exit 1
    fi

    log "INFO" "Checking capacity for ASG '${asg_name}' in region '${region}'. State address is '${state_address}'"
    capacity=$(aws autoscaling describe-auto-scaling-groups \
      --auto-scaling-group-names "$asg_name" \
      --region "$region" \
      --query 'AutoScalingGroups[0].{Desired:DesiredCapacity,InService: length(Instances[?LifecycleState==`InService`])}' \
      --output json) || {
      log "ERROR" "Unable to describe ASG '${asg_name}'"
      exit 1
    }
    desired=$(jq -r '.Desired' <<<"$capacity")
    inservice=$(jq -r '.InService' <<<"$capacity")

    # An unknown ASG makes the query return null; refuse to compare non-numbers.
    if [[ ! "$desired" =~ ^[0-9]+$ || ! "$inservice" =~ ^[0-9]+$ ]]; then
      log "ERROR" "Unable to determine capacity for ASG '${asg_name}'"
      exit 1
    fi

    if [ "$desired" -eq "$inservice" ]; then
      log "INFO" "Desired capacity met: $inservice/$desired"
      # Run terraform without the assumed-role creds so it uses the EC2/Jenkins
      # identity; the subshell keeps them intact for the next iteration.
      (
        unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
        terraform untaint "$state_address"
      ) || {
        log "ERROR" "terraform untaint failed for '${state_address}'"
        exit 1
      }
    else
      log "ERROR" "Desired capacity not met: $inservice/$desired"
      exit 1
    fi
  done
}
# AWS flow: pull the state, and only when tainted ASGs exist assume the role
# and process them.
run_aws_untaint() {
  get_state
  log "INFO" "Checking for tainted ASGs"
  if ! has_tainted_asgs; then
    log "INFO" "No tainted ASGs found. Nothing to do."
    return 0
  fi
  log "INFO" "Tainted ASGs found; assuming role and processing..."
  assume_role
  handle_aws
}

# Dispatch on the requested cloud provider; unknown providers are a no-op.
case "$CSP" in
  aws)
    run_aws_untaint
    ;;
  *)
    log "WARN" "CSP '${CSP}' not supported"
    exit 0
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment