A shell script for replacing instances in an ASG whilst ensuring the deployments stay healthy.
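The script takes two optional positional arguments: the number of seconds to wait for deployments to become healthy (default 900, i.e. 15 minutes) and the maximum number of nodes to replace in a single run (default 50). A minimal invocation sketch, assuming the file is saved as replace-asg-nodes.sh (an illustrative name) and that kubectl, the AWS CLI and jq are already authenticated against the target cluster and AWS account:

# wait up to 15 minutes per node, replace at most 10 nodes
./replace-asg-nodes.sh 900 10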
#!/bin/bash
# set -e
# shellcheck disable=SC2178,SC2128
# time to wait for deployments to be healthy
wait_limit=${1:-900} # default set to 15 mins
# nodes to patch in a single go
max_nodes=${2:-50} # default set to max 50 nodes
# Helper function that will return the name of the newest instance in the cluster
get-newest-instance-name(){
  name=$(kubectl get nodes -o custom-columns=":metadata.creationTimestamp,:metadata.name" --no-headers | sort -k1 -r | awk '{print $2}' | head -1)
  echo "$name"
}
# Helper function that will return the name of the oldest instance in the cluster
get-oldest-instance-name(){
  name=$(kubectl get nodes -o custom-columns=":metadata.name" --sort-by=.metadata.creationTimestamp --no-headers | head -1)
  echo "$name"
}
# Helper function that will return the creation timestamp (as epoch seconds) of the oldest instance in the cluster
get-oldest-instance-timestamp(){
  iso=$(kubectl get nodes -o custom-columns=":metadata.creationTimestamp" --sort-by=.metadata.creationTimestamp --no-headers | head -1)
  epoch=$(date -d"$iso" +%s)
  echo "$epoch"
}
# Waits until all deployments are healthy, or the wait limit is reached
wait-for-deployments(){
  # Wait up to $wait_limit seconds (15 mins by default) per node for deployments to be healthy
  max_wait=$wait_limit
  ready=false
  while [[ $max_wait -gt 0 ]] && [ "$ready" == false ]; do
    # Check if deployments are healthy
    ready=true
    # deployment: an array of all the deployment names
    mapfile -t deployment < <(kubectl get deployments --all-namespaces -o custom-columns=":metadata.name" --sort-by=.metadata.name --no-headers)
    # available: an array of all available replicas, same order as deployment
    mapfile -t available < <(kubectl get deployments --all-namespaces -o custom-columns=":status.availableReplicas" --sort-by=.metadata.name --no-headers)
    # desired: an array of all desired replicas, same order as deployment
    mapfile -t desired < <(kubectl get deployments --all-namespaces -o custom-columns=":status.replicas" --sort-by=.metadata.name --no-headers)
    count="${#deployment[@]}"
    for (( i=0; i<count; i++ )); do
if ! [[ "${desired[$i]}" =~ ^[0-9]+$ ]]; then | |
echo "Warning: Deployment ${deployment[$i]} doesn't have any replicas" | |
continue | |
fi | |
if [ "${available[$i]}" -lt "${desired[$i]}" ]; then | |
echo "Deployment ${deployment[$i]} not ready, desired pods: ${desired[$i]}, available pods: ${available[$i]}" | |
ready=false | |
fi | |
done | |
echo "Deployments ready: $ready" | |
    if [ "$ready" = false ]; then
      sleep 10
      max_wait=$((max_wait - 10))
      echo "Waited 10 seconds; up to $max_wait seconds remaining"
    fi
  done
}
# From a node name, find the ASG the node is hosted in
find-asg(){
  instanceid=$(kubectl get nodes "$1" -o jsonpath='{.metadata.labels.instance-id}')
  asg=$(aws ec2 describe-tags --filters "Name=resource-id,Values=$instanceid" "Name=key,Values=aws:autoscaling:groupName" | jq -r '.Tags[0].Value')
  echo "$asg"
}
# Scale up the ASG
scale-up-asg(){
  max=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].MaxSize')
  desired=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].DesiredCapacity')
  desired=$((desired+1))
  if [ "$desired" -le "$max" ]; then
    aws autoscaling set-desired-capacity --auto-scaling-group-name "$1" --desired-capacity "$desired"
  else
    echo "Warning: Autoscaling Group at max, cannot scale up prematurely"
  fi
}
# Scale down the ASG (this expects the default setting of removing the oldest node to be in place)
scale-down-asg(){
  min=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].MinSize')
  desired=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$1" | jq '.AutoScalingGroups[0].DesiredCapacity')
  desired=$((desired-1))
  if [ "$desired" -ge "$min" ]; then
    aws autoscaling set-desired-capacity --auto-scaling-group-name "$1" --desired-capacity "$desired"
  else
    echo "Warning: Autoscaling Group at min, cannot scale down"
  fi
}
echo "Checking deployments are healthy" | |
wait-for-deployments | |
if [ $ready = false ]; then | |
echo "Deployments not in healthy state" | |
# ensure cluster autoscaler is back online | |
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 1 | |
exit 1 | |
fi | |
start_time=$(date '+%s') | |
echo "Starting to patch at time: $start_time" | |
# ensure we have 2 dns pods running | |
kubectl scale deployments/coredns --replicas=2 -n kube-system | |
# disable cluster autoscaler as it messes with stuff | |
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 0 | |
node_count=0
# Run until we have patched every node
until [ "$start_time" -lt "$(get-oldest-instance-timestamp)" ]; do
  oldest_node=$(get-oldest-instance-name)
  echo "Draining node $oldest_node"
  # Scale up asg
  asg=$(find-asg "$oldest_node")
  scale-up-asg "$asg"
  # Give it 60 seconds to create a new node
  sleep 60
  # Wait until newest node is ready
  newest_node=$(get-newest-instance-name)
  while [[ $(kubectl get node "$newest_node" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}') != "True" ]]; do
    echo "Waiting for newest node to be ready: $newest_node" && sleep 1;
  done
  # Cordon the node so nothing new is scheduled on it, then drain the pods off it; if something breaks, move on
  kubectl cordon "$oldest_node" && kubectl drain "$oldest_node" --delete-local-data --ignore-daemonsets --force || echo "Warning: node could not be drained, continuing"
  # Wait until all deployments are healthy
  echo "Waiting for deployments to be healthy"
  wait-for-deployments
  # scale down asg
  scale-down-asg "$asg"
  # Remove the node from kubernetes (so we don't keep trying to remove the same node)
  kubectl delete node "$oldest_node"
  node_count=$((node_count+1))
  if [ "$node_count" -ge "$max_nodes" ]; then
    echo "Patched max number of nodes, finishing"
    break
  fi
done
# ensure cluster autoscaler is back online
kubectl scale deployment/cluster-autoscaler-aws-cluster-autoscaler -n admin --replicas 1
echo "Patching complete, patched $node_count nodes"