Last active
August 29, 2015 14:17
-
-
Save scole-scea/177a367c51d0b93d51f6 to your computer and use it in GitHub Desktop.
Systemd reload hack for fleet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[Unit]
Description=Systemd Reload Hack for coreos fleet
# Ordering only: After= does not pull fleet.service in, it just delays
# this unit until fleet.service has been started when both are queued.
After=fleet.service
# So: comments outside the script part itself, so I don't exceed
# command line length.
#
# The point here is to detect services which are stuck in "not-found"
# state when they shouldn't be. This failure originates from
# fleet-submitted unit files, so we try hard not to affect units
# outside of fleet control.
#
# SBL: a list of unit files that Should-Be-Loaded. This is
#      essentially the output of "fleetctl list-units".
# LLF: a list of unit files that seem to have Local-Load-Failure
#      (local because we're just looking at one host in this script).
#      This is generated by looking at "systemctl list-units".
# UTR: Units-To-Reload. The list of units we want to recover from the
#      load-failure state. It's the intersection between the SBL and
#      LLF lists. (The same variable is reused later for the
#      intersection between the SBL and LTF lists.)
# AWT: Associated-With-Timer: The default name of a service that's
#      associated with a timer unit. (We only handle the default name
#      case. Parsing the actual timer unit file for more is out of
#      scope. This *is* supposed to be a hack.)
# NTS: Need-To-Start: Any unit files we loaded that are not the
#      services associated with timers need to get "systemctl
#      restart"ed. (So: NTS = UTR - AWT.)
# LTF: timer units that "systemctl list-units" reports as
#      loaded/inactive/dead. (The expansion of the abbreviation is not
#      recorded anywhere; presumably something like
#      Local-Timer-Failure.)
#
# So in words:
#
# If we detect any fleet-controlled services that are stuck in
# load-failure, then we run:
#     systemctl daemon-reload
# to fix the load failure.
#
# Then for anything that was in load-failure that *should* be
# running, we run
#     systemctl restart <service1> <service2> ...
#
# Where we define "should be running" as "anything that fleet thinks
# should be loaded which isn't a service associated with a timer".
#
# Obviously, if that definition changes, then this script will need
# to be modified.
#
# In addition, to add to the self-healing, if we find a
# fleet-controlled timer in loaded/inactive/dead state, we ask
# systemctl to restart it. (This can happen if a timer's associated
# service was in the not-found state when the timer went off.)
#
# This hack is intended to be a workaround for
# https://github.com/coreos/fleet/issues/1160
#
[Service]
# The script below loops forever on its own (one pass every 120s).
# Restart=always / RestartSec=20s bring it back 20 seconds after bash
# exits for any reason.
Restart=always
RestartSec=20s
# The whole script is ONE double-quoted argument to "bash -c", joined by
# trailing backslashes. Rules that keep it working:
#   * The backslash must be the very last character on each line.
#   * Inside the quoted argument, \" yields a literal double quote and
#     \\ a literal backslash, so the sed expressions are written with
#     doubled backslashes.
#   * Comments are kept out of the continued line on purpose (they
#     would count against the command line length).
#
# Helpers:
#   intersection A B  - prints every word of A that also occurs in B.
#   difference   A B  - prints every word of A that does not occur in B.
# Both take whitespace-separated word lists passed as single arguments.
#
# Unit-name extraction (both sed calls): the first group strips what
# precedes the unit name -- either a run of padding spaces or a single
# non-blank status token followed by a space -- and the second group
# captures the unit name itself. The padding alternative accepts one
# OR MORE spaces so that rows padded with two spaces are matched too.
# NOTE(review): a row with no prefix at all would still yield the
# second column instead of the unit name; confirm against the output of
# the deployed systemctl.
ExecStart=/bin/bash -c "\
  function intersection() { \
    local keyA keyB; \
    for keyA in $1; do \
      for keyB in $2; do \
        if [ \"$keyA\" == \"$keyB\" ]; then \
          echo \"$keyA\"; \
        fi; \
      done; \
    done; \
  }; \
  function difference() { \
    local keyA keyB; \
    for keyA in $1; do \
      for keyB in $2; do \
        if [ \"$keyA\" == \"$keyB\" ]; then \
          break; \
        fi; \
      done; \
      if [ \"$keyA\" != \"$keyB\" ]; then \
        echo \"$keyA\"; \
      fi; \
    done; \
  }; \
  while true; do \
    SBL=$(fleetctl list-units -fields unit -no-legend | sort | uniq); \
    LLF=$(systemctl -all list-units | sed -n '/.*not-found.*\\(\\(inactive.*dead\\)\\|\\(failed.*failed\\)\\).*/ s/^\\( \\+\\|[^ \\t]\\+ \\)\\([^ \\t]\\+\\)[ \\t].*/\\2/p'); \
    UTR=$(intersection \"$SBL\" \"$LLF\"); \
    if [ -n \"$UTR\" ]; then \
      echo Restarting units: $UTR; \
      set -x; \
      systemctl daemon-reload; \
      AWT=$(fleetctl list-units -fields unit -no-legend | sort | uniq | sed -n 's/\\.timer$/.service/p'); \
      NTS=$(difference \"$UTR\" \"$AWT\"); \
      if [ -n \"$NTS\" ]; then \
        systemctl restart $NTS; \
      fi; \
      set +x; \
    fi; \
    LTF=$(systemctl -all list-units | sed -n '/\\.timer \\+loaded.*inactive.*dead/ s/^\\( \\+\\|[^ \\t]\\+ \\)\\([^ \\t]\\+\\)[ \\t].*/\\2/p'); \
    UTR=$(intersection \"$SBL\" \"$LTF\"); \
    if [ -n \"$UTR\" ]; then \
      echo Restarting units: $UTR; \
      set -x; \
      systemctl restart $UTR; \
      set +x; \
    fi; \
    sleep 120; \
  done;"
# Fleet-specific scheduling options (read by fleet, not by systemd).
# Global=true: presumably schedules this unit on every machine in the
# cluster, which matches the per-host design of the script -- confirm
# against the fleet documentation for the deployed version.
[X-Fleet]
Global=true
This is an update of @sukrit007's original hackish fix at https://gist.github.com/sukrit007/574bfb21fea7bdf1711f
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The service-dead issue seems to appear when `Wants` or `Requires` appears in the service definition, so I believe this is still OK to manage with fleet. (If that turns out to be false, this needs to move into `cloud-init`.)