Last active
December 4, 2021 13:47
-
-
Save portante/2b91dd7d49636c7e40fa53fb7ed1388b to your computer and use it in GitHub Desktop.
A script to help get fluentd pods running on all labeled nodes of an OpenShift cluster. We need this because Kubernetes currently does not support priority or preemption, which we could otherwise use to ensure fluentd pods are always scheduled and run on properly labeled nodes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

#######################################
# Remove the scratch directory used by this script.
# Registered as the EXIT trap, so it runs on every exit path.
# Globals:
#   TMPDIR (read) - path of the scratch directory to delete
# Arguments:
#   None
#######################################
function finish {
  # Quote the path so a directory name containing whitespace or glob
  # characters is removed as one argument, and do nothing at all when the
  # directory was never created.
  if [[ -n "${TMPDIR:-}" ]]; then
    rm -rf -- "$TMPDIR"
  fi
}
# Always clean up the scratch directory, whatever exit path is taken.
trap finish EXIT

# Scratch directory holding the intermediate "oc" listings used below.
TMPDIR=$(mktemp -d)
if [[ -z "$TMPDIR" || ! -d "$TMPDIR" ]]; then
  echo "*** Error *** unable to create a temporary directory" >&2
  exit 1
fi

# Snapshot the cluster state. A failing "oc" must stop the script here:
# otherwise every count below is zero and the script would report
# "Nothing to do" and exit successfully.
if ! oc get nodes -o name > "$TMPDIR/all-nodes"; then
  echo "*** Error *** unable to list nodes" >&2
  exit 1
fi
if ! oc get nodes -o name -l logging-infra-fluentd=true > "$TMPDIR/labeled-nodes"; then
  echo "*** Error *** unable to list nodes labeled logging-infra-fluentd=true" >&2
  exit 1
fi
oc get pods -n logging -l component=fluentd -o wide \
  | grep Running > "$TMPDIR/fluentd-pods-running"
# Only the "oc" stage is checked: grep exiting non-zero just means that no
# fluentd pod is currently Running, which is a valid result.
oc_status=${PIPESTATUS[0]}
if (( oc_status != 0 )); then
  echo "*** Error *** unable to list fluentd pods" >&2
  exit 1
fi

# Line counts of the three listings (arithmetic expansion also strips any
# padding that wc may emit).
total_nodes=$(( $(wc -l < "$TMPDIR/all-nodes") ))
total_nodes_labeled=$(( $(wc -l < "$TMPDIR/labeled-nodes") ))
total_fluentd=$(( $(wc -l < "$TMPDIR/fluentd-pods-running") ))

if (( total_nodes != total_nodes_labeled )); then
  echo "*** Warning *** there are only $total_nodes_labeled of $total_nodes labeled for fluentd"
fi

if (( total_nodes_labeled == total_fluentd )); then
  echo "Nothing to do, all $total_nodes_labeled labeled nodes have fluentd pods running"
  exit 0
fi

missing=$(( total_nodes_labeled - total_fluentd ))
echo "Missing $missing fluentd pods, fixing ..."
#######################################
# Classify the reported state of a fluentd pod on a node.
#
# NOTE: the return value is a flag that callers read via $?, not a
# conventional shell success/failure status:
#   1 - nothing further to wait for: the pod is Running, or it exists in an
#       unexpected state and is skipped with a warning
#   0 - the pod is Pending, ContainerCreating or Evicted
#
# Arguments:
#   $1 - node the pod belongs to (used in messages only)
#   $2 - pod name (used in messages only)
#   $3 - pod state string
# Outputs:
#   Status / warning lines on stdout.
# Exits:
#   Terminates the script with status 1 when any argument is empty.
#######################################
function check_fluentd_state {
  # Locals, so the caller's own node/pod/state variables are never clobbered.
  local node=$1
  local pod=$2
  local state=$3
  local isnowrunning

  if [[ -z "$node" || -z "$pod" || -z "$state" ]]; then
    echo "Logic bomb! -- expected node (\"$node\"), pod (\"$pod\"), and state (\"$state\") arguments"
    exit 1
  fi

  case "$state" in
    Running)
      echo "Node $node is now running $pod."
      isnowrunning=1
      ;;
    Pending | ContainerCreating | Evicted)
      isnowrunning=0
      ;;
    *)
      isnowrunning=1
      echo "** Warning: pod $pod now exists, but is in a bad state, \"$state\", skipping"
      ;;
  esac

  return "$isnowrunning"
}
#######################################
# Look up the fluentd pod on a node via "oc adm manage-node --list-pods"
# and classify its state with check_fluentd_state.
#
# Globals:
#   TMPDIR (read) - scratch directory; mng-node.out / mng-node.err are
#                   (re)written there
# Arguments:
#   $1 - node, as a resource name; everything up to the last "/" is stripped
#        before it is handed to "oc adm manage-node"
# Returns:
#   0 - no fluentd pod is listed for the node yet, or check_fluentd_state
#       returned 0 for it
#   1 - check_fluentd_state returned 1 for the listed pod
#######################################
function check_fluentd_running_mngnode {
  local node=$1
  local pod state

  oc adm manage-node "${node##*/}" --list-pods 2> "$TMPDIR/mng-node.err" \
    | grep -F logging-fluentd > "$TMPDIR/mng-node.out"

  # -s tests the file for content; testing the path string with -z can never
  # be true because the path itself is not an empty string.
  if [[ ! -s "$TMPDIR/mng-node.out" ]]; then
    return 0
  fi

  # Use only the first matching line so that pod and state are single words
  # and cannot shift the positional arguments passed on below.
  pod=$(awk 'NR == 1 { print $2 }' "$TMPDIR/mng-node.out")
  state=$(awk 'NR == 1 { print $4 }' "$TMPDIR/mng-node.out")
  if [[ -z "$pod" ]]; then
    return 0
  fi

  check_fluentd_state "$node" "$pod" "$state"
  return $?
}
# Walk every labeled node. For each one that has no running fluentd pod:
# verify the label, cordon the node, delete a few of its pods to make room,
# wait for fluentd to show up, then uncordon the node again.
#
# The node list is read on file descriptor 3 so that commands run inside the
# loop cannot consume it through stdin.
while read -r node _rest <&3; do
  [[ -n "$node" ]] || continue
  node_name=${node##*/}

  # Does this labeled node have a running fluentd?
  # -w keeps a short node name from also matching a longer one that merely
  # starts with it.
  isrunning=$(grep -c -w -F -- "$node_name" "$TMPDIR/fluentd-pods-running" 2> /dev/null)
  if (( isrunning == 1 )); then
    continue # Fluentd pod is already running, nothing to do ...
  fi

  # Double check it is properly labeled. Extract the key=value token itself
  # instead of the first field of the matching line: that field is "Labels:"
  # when this label happens to be the first one printed.
  oc describe "$node" > "$TMPDIR/describe-node.out"
  label=$(grep -o -E 'logging-infra-fluentd=[^[:space:],]*' "$TMPDIR/describe-node.out" 2> /dev/null | head -n 1)
  if [[ -z "$label" ]]; then
    echo "Logic Bomb! -- Node $node_name missing label logging-infra-fluentd"
    exit 1
  fi
  if [[ "${label#*=}" != "true" ]]; then
    echo "Logic Bomb! -- Node $node_name incorrectly labeled with ${label}"
    exit 1
  fi

  count=$(grep -c logging-fluentd "$TMPDIR/describe-node.out")
  if (( count > 1 )); then
    echo "Logic bomb! -- We have more than one fluentd pod on $node, ($count)"
    exit 1
  fi
  if (( count == 1 )); then
    pod=$(awk '/logging-fluentd/ { print $2 }' "$TMPDIR/describe-node.out")
    state=$(oc get pod -n logging "$pod" | grep -v "STATUS" | awk '{ print $3 }')
    check_fluentd_state "$node" "$pod" "$state"
    isrunning=$?
  else
    isrunning=0
  fi
  if (( isrunning != 0 )); then
    continue
  fi

  echo "fixing $node ..."

  # Cordon the node so nothing new is scheduled onto it while room is made.
  if ! oc adm cordon "$node_name" > "$TMPDIR/node-cordoned-state" 2>&1; then
    echo "*** Warning *** unable to cordon off $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
  cordoned=$(grep -c -E " cordoned$" "$TMPDIR/node-cordoned-state")
  if (( cordoned != 1 )); then
    echo "*** Warning *** unable to cordon off $node: \"$(cat "$TMPDIR/node-cordoned-state")\""
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi

  # NOTE(review): any "exit 1" taken from here until the uncordon step below
  # (including the one inside check_fluentd_state) leaves the node cordoned.
  if oc adm manage-node "$node_name" --list-pods > "$TMPDIR/mng-node.out" 2> "$TMPDIR/mng-node.err"; then
    grep -v 'NAMESPACE' "$TMPDIR/mng-node.out" > "$TMPDIR/pods.lis"
    count=$(grep -c logging-fluentd "$TMPDIR/pods.lis")
    if (( count > 1 )); then
      echo "Logic bomb! -- We have more than one fluentd pod on $node, ($count)"
      exit 1
    fi
    if (( count == 1 )); then
      pod=$(awk '/logging-fluentd/ { print $2 }' "$TMPDIR/pods.lis")
      state=$(awk '/logging-fluentd/ { print $4 }' "$TMPDIR/pods.lis")
      check_fluentd_state "$node" "$pod" "$state"
      isnowrunning=$?
    else
      isnowrunning=0
    fi

    if (( isnowrunning == 0 )); then
      # Delete the first five listed pods on the node. The namespace and pod
      # name are passed as real arguments rather than being pasted into
      # generated shell text that is then piped to bash.
      while read -r pod_ns pod_name _rest <&4; do
        [[ -n "$pod_ns" ]] || continue
        oc delete pod -n "$pod_ns" "$pod_name"
      done 4< <(head -n 5 "$TMPDIR/pods.lis")

      # Poll until the fluentd pod stops being reported as missing/pending.
      check_fluentd_running_mngnode "$node"
      isnowrunning=$?
      while (( isnowrunning == 0 )); do
        echo "Sleeping 5 seconds for fluentd pod to show up..."
        sleep 5
        check_fluentd_running_mngnode "$node"
        isnowrunning=$?
      done
    fi
  else
    echo "*** Warning *** Error listing pods for deletion from $node:"
    cat "$TMPDIR/mng-node.out"
    cat "$TMPDIR/mng-node.err"
  fi

  # Make the node schedulable again.
  if ! oc adm uncordon "$node_name" > "$TMPDIR/node-cordoned-state" 2>&1; then
    echo "*** Warning *** unable to uncordon $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
  cordoned=$(grep -c -E " uncordoned$" "$TMPDIR/node-cordoned-state")
  if (( cordoned != 1 )); then
    echo "*** Warning *** unable to uncordon $node:"
    cat "$TMPDIR/node-cordoned-state"
    exit 1
  fi
done 3< "$TMPDIR/labeled-nodes"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment