Skip to content

Instantly share code, notes, and snippets.

@yteraoka
Created July 16, 2020 01:38
Show Gist options
  • Save yteraoka/cb91a7c38724c302ded7a58dc91fde25 to your computer and use it in GitHub Desktop.
Save yteraoka/cb91a7c38724c302ded7a58dc91fde25 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Replaces the instances of the Auto Scaling group that a given Kubernetes
# node belongs to. The script doubles the group's desired capacity, waits for
# the instances and the Kubernetes nodes to become ready, deregisters the old
# instances from their target groups, drains and deletes the old nodes, and
# finally restores the group's original size and termination policies.
#
# Usage: pass the node name (ip-*.compute.internal) as the only argument.
# Tools invoked: kubectl, aws, jq.
#
# Termination policies set on the Auto Scaling group while the replacement is
# in progress (space-separated; each word is passed as a separate argument).
termination_policy="OldestLaunchTemplate OldestInstance"
# Print a progress message on stdout, coloured green and prefixed with the
# current local time (YYYY-MM-DDTHH:MM:SS).
# Arguments: $@ - words of the message; they are joined with spaces.
info() {
  local stamp
  stamp=$(date +%Y-%m-%dT%H:%M:%S)
  # echo -e turns the \033 sequences into the terminal colour codes.
  echo -e "\033[32m${stamp} $*\033[m"
}
# Print an optional error message followed by the usage line.
# Arguments: $@ - optional message; when present it is printed first,
#                 followed by an empty line.
# Outputs:   everything is written to stderr, so the message and the usage
#            line stay together when stdout is redirected.
usage() {
  if [ $# -ge 1 ]; then
    # Quoted so the message is printed verbatim: no word splitting and no
    # pathname expansion of characters such as '*'.
    printf '%s\n\n' "$*" 1>&2
  fi
  echo "Usage: $0 ip-111-222-333-444.ap-northeast-1.compute.internal" 1>&2
}
# The only argument is the Kubernetes node name of one member of the Auto
# Scaling group to be replaced.
nodename=$1
# Match inside the shell rather than through `echo $nodename | grep`: the
# unquoted expansion was subject to word splitting and pathname expansion.
if [[ "${nodename}" != ip-* ]]; then
  usage
  exit 1
fi
# Make sure the node exists in the current kubectl context. The node listing
# (or kubectl's own error message) is shown to the operator.
if ! kubectl get node "${nodename}"; then
  usage "Unknown node: ${nodename}"
  exit 1
fi
# Resolve the EC2 instance behind the node. The providerID looks like
#   aws:///ap-northeast-1a/i-0fc00637cc970e6f5
provider_id=$(kubectl get node -o json \
  | jq -r --arg name "${nodename}" '.items[] | select(.metadata.name == $name) | .spec.providerID')
# The last path component is the instance ID, e.g. i-0fc00637cc970e6f5
instance_id=${provider_id##*/}
if [[ "${instance_id}" != i-* ]]; then
  echo "Could not determine the EC2 instance ID of ${nodename} (providerID: '${provider_id}')" 1>&2
  exit 1
fi

# The Auto Scaling group name is read from the instance's
# aws:autoscaling:groupName tag.
asgname=$(aws --output json ec2 describe-instances --instance-ids "${instance_id}" \
  | jq -r '.Reservations[0].Instances[0].Tags[] | select(.Key == "aws:autoscaling:groupName") | .Value')
if [[ -z "${asgname}" ]]; then
  # Stop here: every later step would otherwise run with an empty group name.
  echo "No aws:autoscaling:groupName tag found on instance ${instance_id}" 1>&2
  exit 1
fi
info "Autoscaling group name: ${asgname}"

# Scratch file for AWS CLI output; removed on every exit path.
tmpfile1=$(mktemp) || exit 1
echo "${tmpfile1}"
# Single quotes: the variable is expanded (and quoted) when the trap fires.
trap 'rm -f -- "${tmpfile1}"' EXIT

# Remember the group's current settings so they can be restored at the end.
aws --output json autoscaling describe-auto-scaling-groups --auto-scaling-group-names "${asgname}" > "${tmpfile1}"
orig_asgmax=$(jq -r '.AutoScalingGroups[0].MaxSize' "${tmpfile1}")
orig_asgdesired=$(jq -r '.AutoScalingGroups[0].DesiredCapacity' "${tmpfile1}")
# Space-separated list, e.g. "OldestLaunchTemplate OldestInstance".
orig_termination_policy=$(jq -r '.AutoScalingGroups[0].TerminationPolicies | join(" ")' "${tmpfile1}")
# Instances that are members of the group before the scale-out; these are the
# ones drained and deleted later.
old_instances=$(jq -r '.AutoScalingGroups[0].Instances[] | .InstanceId' "${tmpfile1}")
if ! [[ "${orig_asgmax}" =~ ^[0-9]+$ && "${orig_asgdesired}" =~ ^[0-9]+$ ]]; then
  # Without numeric sizes the arithmetic below and the later wait loop break.
  echo "Could not read the current size of Auto Scaling group ${asgname}" 1>&2
  exit 1
fi
info "asgmax: ${orig_asgmax}"
info "asgdesired: ${orig_asgdesired}"
info "termination_policy: ${orig_termination_policy}"
# Left unquoted on purpose so the newline-separated IDs are logged on one line.
# shellcheck disable=SC2086
info asg_instances: ${old_instances}

# Double the desired capacity and raise the maximum only when it would
# otherwise be lower than the new desired capacity.
new_asgdesired=$(( orig_asgdesired * 2 ))
if (( orig_asgmax < new_asgdesired )); then
  new_asgmax=${new_asgdesired}
else
  new_asgmax=${orig_asgmax}
fi
# Build the space-separated list of target group ARNs in which the instance
# is currently registered. Each target group is queried for its target health
# and kept when the instance shows up there.
info "Searching ${instance_id} in target groups"
target_group_arns=""
for target_group_arn in $(aws --output json elbv2 describe-target-groups --no-paginate \
    | jq -r '.TargetGroups[].TargetGroupArn'); do
  state=$(aws --output json elbv2 describe-target-health --target-group-arn "${target_group_arn}" \
    | jq -r --arg id "${instance_id}" \
        '.TargetHealthDescriptions[] | select(.Target.Id == $id) | .TargetHealth.State')
  # No state means the instance is not a target of this group.
  [[ -z "${state}" ]] && continue
  info "${instance_id} registerd in ${target_group_arn}, state: ${state}"
  target_group_arns+=" ${target_group_arn}"
done
# Increase number of instances: apply the temporary termination policies and
# the doubled capacity in one update.
info "Updating ASG: TerminationPolicy=${termination_policy}, maxSize: ${new_asgmax}, desired: ${new_asgdesired}"
# termination_policy is a space-separated list; it is left unquoted on purpose
# so that each policy becomes its own argument.
# shellcheck disable=SC2086
if ! aws autoscaling update-auto-scaling-group --auto-scaling-group-name "${asgname}" \
    --termination-policies ${termination_policy} \
    --max-size "${new_asgmax}" \
    --desired-capacity "${new_asgdesired}"; then
  # Without this check a rejected update would leave the loop below waiting
  # forever for instances that are never launched.
  echo "Failed to update Auto Scaling group ${asgname}" 1>&2
  exit 1
fi

info "Waiting for all instances getting ready"
while :; do
  # One compact JSON array per instance: [id, type, lifecycle state, health].
  aws --output json autoscaling describe-auto-scaling-groups --auto-scaling-group-names "${asgname}" \
    | jq -M -c '.AutoScalingGroups[0].Instances[] | [.InstanceId, .InstanceType, .LifecycleState, .HealthStatus]' > "${tmpfile1}"
  num_ready_nodes=$(grep -c '"InService","Healthy"' "${tmpfile1}")
  # -ge rather than -eq: the loop also ends if more instances than requested
  # are in service.
  if [[ "${num_ready_nodes}" -ge "${new_asgdesired}" ]]; then
    echo
    info "All instances is ready"
    cat "${tmpfile1}"
    break
  fi
  echo -n .
  sleep 5
done
# Block until `kubectl get node` lists no node whose STATUS column is
# anything other than exactly "Ready".
# NOTE(review): nodes that have not registered with the cluster yet are not
# listed at all, so this only checks nodes the API server already knows about
# - confirm that is sufficient before old nodes are drained.
info "Waiting for all kubernetes nodes getting ready"
kubectl get node
while :; do
  # A failed kubectl call produces no listing; it must not be mistaken for
  # "no unready nodes", so the check only runs when the call succeeded.
  if node_table=$(kubectl get node); then
    not_ready_nodes=$(printf '%s' "${node_table}" | grep -v '^NAME' | grep -c -v ' Ready ')
    if [[ "${not_ready_nodes}" -eq 0 ]]; then
      break
    fi
  fi
  # Pause between polls instead of querying the API server in a tight loop.
  sleep 5
done
kubectl get node
# For every instance that was in the group before the scale-out: remove it
# from the target groups found earlier, wait until it no longer appears in
# their target health, then drain and delete the matching Kubernetes node.
info "Drain and delete all old instances"
for instance_id in ${old_instances}; do
  for target_group_arn in ${target_group_arns}; do
    info "Deregistering ${instance_id} from target group ${target_group_arn}"
    aws elbv2 deregister-targets --target-group-arn "${target_group_arn}" --targets "Id=${instance_id}"
    info "Waiting to be finished draining (300 seconds)"
    while :; do
      state=$(aws --output json elbv2 describe-target-health --target-group-arn "${target_group_arn}" \
        | jq -r --arg id "${instance_id}" \
            '.TargetHealthDescriptions[] | select(.Target.Id == $id) | .TargetHealth.State')
      # The instance is gone from the target group once no state is reported.
      if [[ -z "${state}" ]]; then
        break
      fi
      echo "${state}"
      sleep 30
    done
  done

  # The node name is taken to be the instance's private DNS name.
  nodename=$(aws --output json ec2 describe-instances --instance-ids "${instance_id}" \
    | jq -r '.Reservations[0].Instances[0].PrivateDnsName')
  if [[ -z "${nodename}" || "${nodename}" == "null" ]]; then
    # Never call kubectl drain/delete with an empty or bogus node name.
    echo "Could not resolve the node name of ${instance_id}; skipping drain" 1>&2
    continue
  fi

  info "Draining $nodename (${instance_id})"
  if ! kubectl drain "${nodename}" --ignore-daemonsets --delete-local-data; then
    # Do not delete a node that may still be running pods.
    echo "Failed to drain ${nodename} (${instance_id}); node is not deleted" 1>&2
    continue
  fi
  info "Deleting $nodename (${instance_id}) from EKS cluster"
  kubectl delete node "${nodename}"
done
# Give the pods evicted by the drain time to start up elsewhere.
# This is only a fixed sleep instead of a readiness check, because some pods
# may already have been unhealthy before the drain.
sleep 60
info "kubectl get pods -A | grep -v Running"
kubectl get pods -A | grep -v Running

# Revert the Auto Scaling group to the settings recorded at the start.
# NOTE(review): the original termination policies are restored in the same
# call that lowers the desired capacity - confirm which policy the resulting
# scale-in uses.
info "Updating ASG: TerminationPolicy=${orig_termination_policy}, maxSize: ${orig_asgmax}, desired: ${orig_asgdesired}"
# orig_termination_policy is a space-separated list and stays unquoted so
# that every policy is passed as a separate argument.
# shellcheck disable=SC2086
aws autoscaling update-auto-scaling-group \
  --auto-scaling-group-name "${asgname}" \
  --desired-capacity "${orig_asgdesired}" \
  --max-size "${orig_asgmax}" \
  --termination-policies ${orig_termination_policy}

# Show the group once and leave; termination of the surplus instances is not
# awaited.
info "Do not wait terminate instance completed"
status_filter='.AutoScalingGroups[0].Instances[] | [.InstanceId, .InstanceType, .LifecycleState, .HealthStatus]'
aws --output json autoscaling describe-auto-scaling-groups --auto-scaling-group-names "${asgname}" \
  | jq -M -c "${status_filter}"

# Print the same status command so the operator can re-run it by hand.
info "Try following command to check termination process"
echo
printf '%s\n' "aws --output json autoscaling describe-auto-scaling-groups --auto-scaling-group-names ${asgname} | jq -M -c '${status_filter}'"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment