Created
June 25, 2019 20:18
-
-
Save pracucci/8e8d0eecfa6fa603be9e239e6b5fd396 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# On 2018-08-21 we found out the existence of a bug in the Kubernetes cluster
# which leaves stale / dangling cgroups on the system related to secret volume | |
# mounts. | |
# | |
# The root cause of this bug is not clear yet, even if it's likely to be a systemd | |
# bug because the kubelet runs `systemd-run` command to mount secret volumes and, | |
# according to logs, successfully `unmount` such volumes once the pods terminate.
# | |
# Systemd should (and sometimes does) stop the systemd unit (and release its cgroups) | |
# once the mount is unmounted, but sometimes it doesn't. For this reason, it may be | |
# a systemd bug. | |
# | |
# Log functions

#######################################
# Log an informational message through logger(1), tagged with this script's
# name so entries are easy to find in the system log.
# Arguments: $1 - message text
# Outputs:   hands the message to `logger -s -p user.notice`
#            (-s also echoes it to stderr, -p selects facility.level)
#######################################
log_info() {
  local message=$1
  logger -s -p user.notice "cleanup-stale-cgroups.sh: ${message}"
}
# List all transient mount units.
#
# The listing is captured on its own so that a failing `systemctl` is reported
# instead of being mistaken for "nothing to clean up".
#   --plain      no bullet/indent column, so each line starts with the unit name
#   --no-legend  no header/footer lines
#   --no-pager   never start a pager
#   --full       defensive: never ellipsize unit names or descriptions (a
#                truncated pod path would look like a pod that no longer exists)
log_info "Listing all systemd units related to 'Kubernetes transient mount' to find stale cgroups to delete"
if ! UNITS=$(systemctl list-units --plain --no-legend --no-pager --full); then
  log_info "Failed to list systemd units"
  exit 1
fi

# Keep only the units whose description mentions a Kubernetes transient mount
# located under a pod directory.
ENTRIES=$(grep -E '^[^ ]+ .*Kubernetes transient mount .*/var/lib/kubelet/pods/[^/]+' <<<"$UNITS")
if [[ -z "$ENTRIES" ]]; then
  log_info "No systemd units related to 'Kubernetes transient mount' found"
  exit 0
fi

# For each transient mount systemd unit check if the pod exists. If not,
# stop the unit to cleanup the related cgroups.
log_info "Checking all systemd units related to 'Kubernetes transient mount' to find stale cgroups to delete"

# Regex matching the pod directory (/var/lib/kubelet/pods/<pod id>) in an entry.
# Kept in a variable so that [[ =~ ]] treats it as a regex, not a quoted literal.
POD_DIR_REGEX='/var/lib/kubelet/pods/[^/]+'
FAILED=0

# Iterate line by line without touching the global IFS.
while IFS= read -r ENTRY; do
  # Get the unit name (first space-delimited column) and the pod path (first
  # match only, so POD_PATH is always a single path).
  UNIT_NAME=${ENTRY%% *}
  [[ "$ENTRY" =~ $POD_DIR_REGEX ]] || continue
  POD_PATH=${BASH_REMATCH[0]}

  if [[ ! -e "$POD_PATH" ]]; then
    log_info "Stopping systemd unit ${UNIT_NAME} related to ${POD_PATH}"
    # stdin is redirected so systemctl can never consume the loop's input.
    if ! systemctl stop -- "$UNIT_NAME" </dev/null; then
      log_info "Failed to stop systemd unit ${UNIT_NAME}"
      FAILED=$((FAILED + 1))
    fi
  fi
done <<<"$ENTRIES"

# Keep going after individual failures (best effort), but do not claim success.
if (( FAILED > 0 )); then
  log_info "Cleanup completed with ${FAILED} unit(s) that could not be stopped"
  exit 1
fi

log_info "Cleanup successfully completed"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment