Last active
June 14, 2018 14:08
-
-
Save jbouzekri/11c56ed5ebb038e6166300ed294630e7 to your computer and use it in GitHub Desktop.
Detach Azure disks used by Kubernetes pods that are in error because of disk-related issues.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Detach Azure data disks that keep Kubernetes pods in error.
#
# Usage:
#   ./fix-azure-disk-double-mount.sh <azure subscription id> <azure resource_group> <kube context> <kube namespace>
#
# Flow:
#   1. Select the Azure subscription and the kubectl context (abort if either fails,
#      so that nothing is detached in the wrong place).
#   2. List the pods of the namespace whose STATUS is neither Running nor Completed.
#   3. Scan `kubectl describe pod` of each of them for Azure disk attachment errors
#      and collect the disk names quoted in those errors.
#   4. List the data disks of every VM of the resource group whose name contains "agent".
#   5. Detach each failed disk from every VM it is attached to.
#
# Requires: jq, az (or az.cmd), kubectl.

# Fail loudly on a typo in a variable name instead of expanding it to "".
set -u

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prefix every line read on stdin with "* ".
bullet_list() {
  sed -e 's/^/\* /'
}

# Print the disk names quoted in the Azure attachment errors found in the
# `kubectl describe pod` text given as $1. "Blob is already in use" errors are
# looked up first; "AttachDiskWhileBeingDetached" errors are only used when the
# first lookup finds nothing. Prints nothing when neither error is present.
extract_failed_disks() {
  local describe_output=$1
  local disks
  disks=$(grep "Blob is already in use" <<<"$describe_output" \
    | grep -o "disk '.*' using" | grep -o "'.*'" | tr -d "'")
  if [[ -z "$disks" ]]; then
    disks=$(grep "AttachDiskWhileBeingDetached" <<<"$describe_output" \
      | grep -o "disk '.*' to" | grep -o "'.*'" | tr -d "'")
  fi
  if [[ -n "$disks" ]]; then
    printf '%s\n' "$disks"
  fi
}

if [[ "$#" -ne 4 ]]; then
  {
    echo "Illegal number of parameters"
    echo ""
    echo "Usage: ./fix-azure-disk-double-mount.sh <azure subscription id> <azure resource_group> <kube context> <kube namespace>"
    echo ""
    echo "Find all pods in error, checks if it is related to an Azure disk issue. Then detach from azure VM these disks. It should trigger a resync automatically on kube side. Run the scripts multiple times if the first time is not the one."
  } >&2
  exit 1
fi

AZURE_SUB_ID=$1
AZURE_RG=$2
KUBE_CTX=$3
KUBE_NS=$4

NL=$'\n'
DISKS_MAPPING=""      # one "<disk name> <vm name>" pair per line
KUBE_FAILED_DISKS=""  # one disk name per line

# First check that mandatory tools are here.
# `command -v` is a shell builtin; any non-zero status means "not found".
command -v jq >/dev/null 2>&1 || die "jq not installed. Aborting."
# az.cmd is tried first, then plain az.
AZ_BIN=$(command -v az.cmd 2>/dev/null) \
  || AZ_BIN=$(command -v az 2>/dev/null) \
  || die "az cli not found"
KUBE_BIN=$(command -v kubectl 2>/dev/null) || die "kubectl not found"

echo ""
echo "using \"$AZ_BIN\" binary"
echo "using \"$KUBE_BIN\" binary"

# Switch contexts in azure and kube. Both switches are mandatory: carrying on
# after a failure would detach disks in whatever subscription/context was
# previously selected.
echo ""
echo "Switching to azure subscription $AZURE_SUB_ID ..."
"$AZ_BIN" account set --subscription "$AZURE_SUB_ID" \
  || die "could not switch to azure subscription \"$AZURE_SUB_ID\""
echo "Switched to azure subscription \"$AZURE_SUB_ID\"."

echo ""
echo "Switching to context $KUBE_CTX ..."
"$KUBE_BIN" config use-context "$KUBE_CTX" \
  || die "could not switch to kube context \"$KUBE_CTX\""

# Find all failed pods
echo ""
echo "Loading failed pods from context $KUBE_CTX and namespace $KUBE_NS ..."
PODS_TABLE=$("$KUBE_BIN" get pods -n "$KUBE_NS" --no-headers) \
  || die "could not list pods in namespace \"$KUBE_NS\""
# Filter on the STATUS column (3rd column of the default output) rather than on
# the whole line, so that a pod whose *name* contains "running" or "completed"
# is not silently skipped.
PODS_LIST=$(awk 'NF && tolower($3) != "running" && tolower($3) != "completed" {print $1}' <<<"$PODS_TABLE")
if [[ -z "$PODS_LIST" ]]; then
  echo "no failed pods found"
  exit 0
fi

echo ""
echo "Failed pods found in context $KUBE_CTX and namespace $KUBE_NS :"
echo "$PODS_LIST" | bullet_list

# For each failed pod, find the ones with describe messages related to Azure disk attachment problems
# and extract failed disks name.
# The lists are read through dedicated file descriptors so that the commands
# run inside the loops cannot consume the list from stdin.
while IFS= read -r pod_name <&3; do
  [[ -n "$pod_name" ]] || continue
  # Describe each pod once and search the captured text for both error kinds.
  describe_output=$("$KUBE_BIN" describe pod "$pod_name" -n "$KUBE_NS")
  KUBE_DISKS=$(extract_failed_disks "$describe_output")
  # Sometimes the error is not related to disk attachment error
  [[ -n "$KUBE_DISKS" ]] || continue
  KUBE_FAILED_DISKS="${KUBE_FAILED_DISKS:+${KUBE_FAILED_DISKS}${NL}}${KUBE_DISKS}"
done 3<<<"$PODS_LIST"

# Nothing to detach: stop here instead of listing every VM for no purpose.
if [[ -z "$KUBE_FAILED_DISKS" ]]; then
  echo ""
  echo "no failed disks found"
  exit 0
fi
KUBE_FAILED_DISKS=$(sort -u <<<"$KUBE_FAILED_DISKS")

echo ""
echo "Failed disks found in context $KUBE_CTX and namespace $KUBE_NS :"
echo "$KUBE_FAILED_DISKS" | bullet_list

# Load all disks in all VMs
echo ""
echo "Loading VM list from resource group \"$AZURE_RG\" ..."
VMS_LIST=$("$AZ_BIN" vm list -g "$AZURE_RG" | jq -r '.[] | .name' | grep agent)
if [[ -z "$VMS_LIST" ]]; then
  echo "no VM found in this resource group"
  exit 1
fi

echo ""
echo "VM found in resources group $AZURE_RG:"
echo "$VMS_LIST" | bullet_list
echo ""

while IFS= read -r vm_name <&3; do
  [[ -n "$vm_name" ]] || continue
  echo "Loading disks for VM $vm_name ..."
  # `[]?` yields nothing (instead of a jq error) when the VM has no dataDisks array.
  DISKS=$("$AZ_BIN" vm get-instance-view --name "$vm_name" -g "$AZURE_RG" \
    | jq -r '.storageProfile.dataDisks[]? | .name')
  while IFS= read -r disk <&4; do
    [[ -n "$disk" ]] || continue
    DISKS_MAPPING="${DISKS_MAPPING:+${DISKS_MAPPING}${NL}}${disk} ${vm_name}"
  done 4<<<"$DISKS"
done 3<<<"$VMS_LIST"

# For each failed disks, detach them in all the VMs we found them on
while IFS= read -r failed_disk <&3; do
  [[ -n "$failed_disk" ]] || continue
  # Exact comparison on the disk column: a regex/substring match could also hit
  # a longer disk name or a VM name, and detach a healthy disk.
  VMS_WITH_THIS_DISK=$(awk -v disk="$failed_disk" '$1 == disk {print $2}' <<<"$DISKS_MAPPING")
  if [[ -z "$VMS_WITH_THIS_DISK" ]]; then
    echo ""
    echo "Disk $failed_disk is not mounted on any VMs"
    continue
  fi

  echo ""
  echo "Disk $failed_disk found in VMs :"
  echo "$VMS_WITH_THIS_DISK" | bullet_list
  while IFS= read -r vm_name <&4; do
    [[ -n "$vm_name" ]] || continue
    echo "attempting to detach disk $failed_disk from $vm_name in resources group $AZURE_RG ..."
    # Best effort: report a failed detach but keep going with the other VMs/disks.
    "$AZ_BIN" vm disk detach --name "$failed_disk" -g "$AZURE_RG" --vm-name "$vm_name" \
      || echo "failed to detach disk $failed_disk from $vm_name" >&2
  done 4<<<"$VMS_WITH_THIS_DISK"
done 3<<<"$KUBE_FAILED_DISKS"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment