-
-
Save tomasaschan/9dbc9180d313ad8cae57f62ce229610b to your computer and use it in GitHub Desktop.
#!/bin/bash
# Rolling restart of the VMs backing an AKS cluster's nodes.
set -e

# Defaults; each can be overridden via environment variable or CLI flag.
resourceGroupDefault='<set your default here, to avoid having to specify in the common case>'
clusterNameDefault='<set your default here>'
regionDefault='<set your default here>'

resourceGroup="${RESOURCE_GROUP:-$resourceGroupDefault}"
clusterName="${CLUSTER_NAME:-$clusterNameDefault}"
region="${REGION:-$regionDefault}"

# Behavior flags and node selection, set by the argument parser below.
force=false
dryrun=false
nodes=''
# Print command-line help to stdout, including the currently configured defaults.
function print_usage() {
  cat <<EOF
Usage: $0 [<options>]

-n|--node <node> The name of a node to restart.
 By default, a rolling restart of all nodes
 is performed.

--resource-group <group-name> The resource group of the cluster.
 Can also be set by RESOURCE_GROUP
 Default: $resourceGroupDefault

--cluster-name <cluster-name> The name of the cluster.
 Can also be set by CLUSTER_NAME
 Default: $clusterNameDefault

--region <azure-region> The Azure region in which the cluster is.
 Can also be set by REGION
 Default: $regionDefault

-f|--force Restart node(s) without first draining.
 Useful if draining a node fails.

-d|--dry-run Just print what to do; don't actually do it

-h|--help Print usage and exit.
EOF
}
# Parse command-line flags; see print_usage for the documented interface.
while [[ $# -gt 0 ]]
do
  key="$1"
  case $key in
    -n|--node)
      node="$2"
      shift
      shift
      ;;
    --resource-group)
      resourceGroup="$2"
      shift
      shift
      ;;
    --cluster-name)
      clusterName="$2"
      shift
      shift
      ;;
    --region)
      region="$2"
      shift
      shift
      ;;
    -f|--force)
      force=true
      shift
      ;;
    # Bug fix: usage advertises the short form "-d", but only "--dry-run"
    # was previously accepted, so "-d" fell through to the error branch.
    -d|--dry-run)
      dryrun=true
      shift
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    *)
      # Unknown option: show help and fail.
      print_usage
      exit 1
      ;;
  esac
done
# AKS stores the node VMs in an auto-generated resource group
# named "MC_<resource-group>_<cluster-name>_<region>".
printf -v group 'MC_%s_%s_%s' "$resourceGroup" "$clusterName" "$region"
# Poll until the node's Ready condition carries the given reason
# (KubeletNotReady while the VM is going down, KubeletReady when it is back).
# The jsonpath selects the condition whose .reason matches and prints its
# .type, which is "Ready" for the condition this script watches.
# Arguments: $1 - node name; $2 - expected condition reason
# Exits 1 if the state is not reached within 30 polls (~1 minute).
function wait_for_status() {
  local node=$1
  local reason=$2
  local i=0
  local status
  while [[ $i -lt 30 ]]; do
    # '|| true' guards against kubectl failing while the node is rebooting;
    # under 'set -e' an unguarded failure would abort the whole script
    # mid-restart, leaving the node cordoned.
    status=$(kubectl get node "$node" -o "jsonpath={.status.conditions[?(.reason==\"$reason\")].type}" || true)
    if [[ "$status" == "Ready" ]]; then
      echo "$reason after $((i*2)) seconds"
      break
    else
      sleep 2s
      i=$((i+1))
    fi
  done
  # i only reaches 30 if the loop above never hit the break.
  if [[ $i == 30 ]]; then
    echo "Error: Did not reach $reason state within 1 minute" >&2
    exit 1
  fi
}
# Select the nodes to operate on: just the one given via -n/--node,
# or every node in the cluster for a full rolling restart.
if [ -z "$node" ]; then
  nodes=$(kubectl get nodes -o jsonpath={.items[*].metadata.name})
else
  nodes="$node"
fi
for node in $nodes; do
  # Step 1: get workloads off the node. --force only cordons (no drain),
  # which is useful when draining fails.
  if $force; then
    echo "WARNING: --force specified, restarting node $node without draining first"
    if $dryrun; then
      echo "kubectl cordon $node"
    else
      kubectl cordon "$node"
    fi
  else
    echo "Draining $node..."
    if $dryrun; then
      echo "kubectl drain $node --ignore-daemonsets --delete-local-data"
    else
      # NOTE(review): --delete-local-data was renamed --delete-emptydir-data
      # in newer kubectl releases — confirm against your kubectl version.
      kubectl drain "$node" --ignore-daemonsets --delete-local-data
    fi
  fi
  # Step 2: reboot the underlying Azure VM. $group is the MC_* resource
  # group AKS creates for node VMs (computed above from rg/cluster/region).
  echo "Initiating VM restart for $node..."
  if $dryrun; then
    echo "az vm restart --resource-group $group --name $node"
  else
    az vm restart --resource-group "$group" --name "$node"
  fi
  # Step 3: wait for the Ready condition's reason to flip to KubeletNotReady
  # (node actually went down) and then back to KubeletReady (node is back).
  # Skipped on --dry-run since nothing was actually restarted.
  if ! $dryrun; then
    echo "Waiting for $node to start back up..."
    wait_for_status $node KubeletNotReady
    wait_for_status $node KubeletReady
  fi
  # Step 4: allow scheduling on the node again.
  echo "Re-enabling $node for scheduling"
  if $dryrun; then
    echo "kubectl uncordon $node"
  else
    kubectl uncordon "$node"
  fi
done
@gvanriper Note that this script does a rolling restart, i.e. with zero downtime. I assume az aks stop
followed by az aks start
would result in downtime while the cluster is stopped?
Yes, this is a hack, and it's by no means supported in any way by Microsoft. It's also 18 months old and hasn't been used (at least by me) since; it was created to work around an instability in AKS at the time, which I sincerely hope is no longer a problem (my nodes were forgetting to share networking capabilities with my containers, so the containers became impossible to reach, but the cluster did not detect them as unhealthy).
@tomasaschan, you're correct that it would result in downtime. I just wanted anyone following this that hadn't noticed to know that AKS start/stop is now an option and supported by MS.
Do not do this: it's not supported by MS, so you're on your own if something goes wrong. MS now has a start/stop feature. Utilize az aks start/stop.
https://docs.microsoft.com/en-us/azure/aks/start-stop-cluster