Skip to content

Instantly share code, notes, and snippets.

@tomasaschan
Last active August 30, 2022 17:45
Show Gist options
  • Save tomasaschan/9dbc9180d313ad8cae57f62ce229610b to your computer and use it in GitHub Desktop.
Save tomasaschan/9dbc9180d313ad8cae57f62ce229610b to your computer and use it in GitHub Desktop.
Rolling restart of all nodes in an AKS cluster
#!/bin/bash
# Abort the script as soon as any unguarded command fails.
set -o errexit

# Fallback values used when neither the environment variable nor the
# corresponding command-line option supplies one. Edit these for your cluster.
resourceGroupDefault='<set your default here, to avoid having to specify in the common case>'
clusterNameDefault='<set your default here>'
regionDefault='<set your default here>'

# Effective settings: the environment variable wins when it is set and
# non-empty, otherwise the fallback above is used. Command-line options
# parsed further down may still overwrite these.
resourceGroup="${RESOURCE_GROUP:-${resourceGroupDefault}}"
clusterName="${CLUSTER_NAME:-${clusterNameDefault}}"
region="${REGION:-${regionDefault}}"

# Behaviour switches, stored as the command names `true`/`false` so they can
# be used directly as `if $flag; then`.
dryrun=false
force=false

# Space-separated list of node names to restart; filled in after parsing.
nodes=''
# Print the command-line help text to stdout.
# Globals read: resourceGroupDefault, clusterNameDefault, regionDefault
#               (shown as the "Default:" value of their options).
# Arguments:    none.
print_usage() {
  # Unquoted delimiter on purpose: $0 and the *Default variables must expand.
  cat <<EOF
Usage: $0 [<options>]

-n|--node <node> The name of a node to restart.
 By default, a rolling restart of all nodes
 is performed.

--resource-group <group-name> The resource group of the cluster.
 Can also be set by RESOURCE_GROUP
 Default: $resourceGroupDefault

--cluster-name <cluster-name> The name of the cluster.
 Can also be set by CLUSTER_NAME
 Default: $clusterNameDefault

--region <azure-region> The Azure region in which the cluster is.
 Can also be set by REGION
 Default: $regionDefault

-f|--force Restart node(s) without first draining.
 Useful if draining a node fails.

-d|--dry-run Just print what to do; don't actually do it

-h|--help Print usage and exit.
EOF
}
# Parse the command-line options into the script's global settings.
# Globals written: node, resourceGroup, clusterName, region, force, dryrun
# Arguments:       the script's own arguments ("$@")
# Exits:           0 after printing usage for -h/--help;
#                  1 (after printing usage) for an unknown option or for a
#                  value-taking option that is missing its value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Value-taking options must actually be followed by a value; otherwise
    # the second shift would fail and `set -e` would end the script without
    # any explanation.
    case "$1" in
      -n|--node|--resource-group|--cluster-name|--region)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          print_usage >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      -n|--node)
        node="$2"
        shift 2
        ;;
      --resource-group)
        resourceGroup="$2"
        shift 2
        ;;
      --cluster-name)
        clusterName="$2"
        shift 2
        ;;
      --region)
        region="$2"
        shift 2
        ;;
      -f|--force)
        force=true
        shift
        ;;
      # The usage text documents the short form -d, so accept it as well.
      -d|--dry-run)
        dryrun=true
        shift
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        print_usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Resource group handed to `az vm restart` further down. It is assembled as
# MC_<resourceGroup>_<clusterName>_<region>.
# NOTE(review): this presumably mirrors AKS's default name for the cluster's
# node resource group - confirm for clusters created with a custom name.
group="MC_${resourceGroup}_${clusterName}_$region"
# Poll a node until one of its status conditions carries the given reason and
# that condition's type is "Ready".
# Arguments: $1 - node name
#            $2 - condition reason to wait for (the script passes
#                 KubeletNotReady and KubeletReady)
# Outputs:   "<reason> after N seconds" on stdout once the reason is seen;
#            an error message on stderr on timeout.
# Exits:     terminates the whole script with status 1 if the reason has not
#            appeared after 30 polls spaced 2 seconds apart (about 1 minute).
wait_for_status() {
  # Everything is local so the caller's globals (notably its own loop
  # variable `node`) are left untouched.
  local node=$1
  local reason=$2
  local -r max_attempts=30
  local -r poll_seconds=2
  local attempt=0
  local status

  while (( attempt < max_attempts )); do
    # The jsonpath filter selects the condition whose reason matches and
    # yields its type; an empty result means the reason is not present yet.
    status=$(kubectl get node "$node" -o "jsonpath={.status.conditions[?(.reason==\"$reason\")].type}")
    if [[ "$status" == "Ready" ]]; then
      echo "$reason after $((attempt * poll_seconds)) seconds"
      return 0
    fi
    # Plain number of seconds: the "2s" suffix form is not accepted by every
    # sleep implementation.
    sleep "$poll_seconds"
    attempt=$((attempt + 1))
  done

  echo "Error: Did not reach $reason state within 1 minute" >&2
  exit 1
}
# Decide which nodes to restart: the single node given with -n/--node, or,
# when none was given, every node name the cluster reports (kubectl prints
# them separated by spaces).
if [[ -n "$node" ]]; then
  nodes="$node"
else
  # The jsonpath expression is quoted so the shell never treats its [*] as a
  # filename pattern.
  nodes=$(kubectl get nodes -o 'jsonpath={.items[*].metadata.name}')
fi
# Execute a command, or in dry-run mode just print it.
# Globals read: dryrun ("true" or "false")
# Arguments:    the command and its arguments
# Outputs:      in dry-run mode, the command line joined by spaces on stdout
# Returns:      the command's own exit status when it is executed
run_or_print() {
  if $dryrun; then
    printf '%s\n' "$*"
  else
    "$@"
  fi
}

# Restart the selected nodes one at a time: take the node out of scheduling,
# restart its VM, wait for the kubelet to report back, then uncordon it.
# $nodes is expanded unquoted on purpose: it is a space-separated name list.
for node in $nodes; do
  if $force; then
    # With --force the node is only cordoned; its pods are not evicted.
    echo "WARNING: --force specified, restarting node $node without draining first"
    run_or_print kubectl cordon "$node"
  else
    echo "Draining $node..."
    # NOTE(review): newer kubectl releases name this flag
    # --delete-emptydir-data; confirm against the kubectl version in use.
    run_or_print kubectl drain "$node" --ignore-daemonsets --delete-local-data
  fi

  echo "Initiating VM restart for $node..."
  run_or_print az vm restart --resource-group "$group" --name "$node"

  # Nothing was restarted in a dry run, so there is nothing to wait for.
  if ! $dryrun; then
    echo "Waiting for $node to start back up..."
    # NOTE(review): if the node is already Ready again by the time
    # `az vm restart` returns, the KubeletNotReady wait may time out - confirm.
    wait_for_status "$node" KubeletNotReady
    wait_for_status "$node" KubeletReady
  fi

  echo "Re-enabling $node for scheduling"
  run_or_print kubectl uncordon "$node"
done
@gvanriper
Copy link

@tomasaschan, you're correct that it would result in downtime. I just wanted anyone following this who hadn't noticed to know that AKS start/stop is now an option and is supported by Microsoft.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment