Last active
March 12, 2025 05:26
-
-
Save joshuaboniface/733a93313d1801d1922ca73b126073d8 to your computer and use it in GitHub Desktop.
BASH cross-locking sequence script example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
################################################################################################################## | |
# An example of a cross-locking sequence script | |
# | |
# The purpose of this script is to provide a basic framework for what I'll call a "cross-locking sequence script". | |
# For example, let's take my usecase: you have a set of steps that need to be taken to shut down a complex server, | |
# and another (reversed) set of steps taken to bring up the server. This script could then be called by, for | |
# example, NUT (Network UPS Tools) based on UPS power state, and ensure you get a consistent shutdown/startup | |
# process. | |
# | |
# See it in action at https://youtu.be/fv3FSAxrzEE | |
# | |
# Why would you want the "cross-locking"? Say your shutdown process is 5 steps long:
# * Prepare for shutdown | |
# * Shut down application A | |
# * Shut down application B | |
# * Shut down application C | |
# * Terminate the server | |
# And where each step can take quite some time to handle (say each app takes 5 minutes to stop and start). | |
# | |
# You start this process when the UPS battery gets low. Now, there are a couple of things that can happen:
# 1. The UPS battery gets low, and you start running the steps above. Then the power comes back on.
# 2. After the power has come back on, it goes out again while the battery is still low.
# | |
# In both of these cases, you have a situation where you want to stop whatever action is currently happening (for | |
# the shutdown, to stop the shutdown steps at whatever step you are on) and then trigger the opposite action, but | |
# only starting at the last state you were at (for the startup, turning back on the applications you stopped but | |
# not wasting time trying to "start up" ones you didn't stop). | |
# | |
# This script gives you a template to automate those steps while ensuring that, at the end of each step, there's | |
# an opportunity for the current "mode" of the script (shutdown or startup) to end cleanly if the opposite mode's | |
# script is waiting to run, and then have the opposite mode reverse only the steps that it got through, without | |
# wasting time trying to execute reversals of steps that never happened. | |
# | |
# For a concrete example, say we're bringing down the server with the steps above. During the shutdown of | |
# Application B, the power comes back on, so a startup script fires. The startup script will wait for that last | |
# step of the shutdown script to finish, then it acquires the lock. Next, the shutdown script will notice that it | |
# no longer has the proper lock state, so it terminates. And the startup script will know that the last step the | |
# shutdown script was at was the "Shut down Application B", so instead of going through the inverse of "Terminate | |
# the server" and "Start Application C" (however long those might take), it starts right at "Start Application B", | |
# possibly saving you valuable downtime. | |
# | |
# In reality, instead of one server with a bunch of applications, I'm using this to perform a mass shutdown of | |
# dozens of VMs and 2 separate storage clusters, but explaining how *that* works would not be nearly as easy! | |
# | |
# Read on to understand the details and how to customize this script for your own usecase! | |
# | |
# This script passes ShellCheck v0.9.0 | |
################################################################################################################## | |
# You can turn on xtrace here to show the full details of what is being called (for debugging) | |
#set -o xtrace | |
# Define our modes; these are binary, only 2 of them, and one of them must be specified as argument 1 to the script
readonly MODE_A="shutdown"
readonly MODE_B="startup"
# Define the total number of steps; this is the number of functions to run plus 1
readonly TOTAL_STEPS=5
# Define our lock timeout in seconds (passed to "flock -w"); this should be a little more than twice the time a
# single function *should* take to run
readonly TIMEOUT=30
# Define our state file; for what I'm doing, this must be persistent so I use "/var/spool" for storage
readonly STATEFILE="/var/spool/test.state"
# Define our wait time in seconds; in this example, this is both the time used inside the functions as well as the
# time waiting for locks between functions; you can tweak this as needed
readonly WAITTIME=2
# Define our functions; you can use anything you want as these are called by name in each step | |
# These examples simply print something out and wait ${WAITTIME} seconds; if you were to use this script for real, | |
# you'd want to replace these with your own functions that do real things, and probably have separate functions | |
# for each mode (e.g. thing_startup and thing_shutdown functions for the "thing" step). | |
# Example step 1: print a progress marker, pause for ${WAITTIME} seconds, then report completion.
# Replace the body with real work when adapting this template.
fn1() {
    # printf instead of "echo -n": echo's option handling differs between shells
    printf '%s' "1... "
    sleep "${WAITTIME}"
    printf '%s\n' "done."
}
# Example step 2: print a progress marker, pause for ${WAITTIME} seconds, then report completion.
# Replace the body with real work when adapting this template.
fn2() {
    # printf instead of "echo -n": echo's option handling differs between shells
    printf '%s' "2... "
    sleep "${WAITTIME}"
    printf '%s\n' "done."
}
# Example step 3: print a progress marker, pause for ${WAITTIME} seconds, then report completion.
# Replace the body with real work when adapting this template.
fn3() {
    # printf instead of "echo -n": echo's option handling differs between shells
    printf '%s' "3... "
    sleep "${WAITTIME}"
    printf '%s\n' "done."
}
# Example step 4: print a progress marker, pause for ${WAITTIME} seconds, then report completion.
# Replace the body with real work when adapting this template.
fn4() {
    # printf instead of "echo -n": echo's option handling differs between shells
    printf '%s' "4... "
    sleep "${WAITTIME}"
    printf '%s\n' "done."
}
# Our state file getter and setter functions | |
# Print the current contents of ${STATEFILE} (the last recorded "<mode>-<sequence>") on stdout.
# Trailing newlines in the file are stripped and exactly one newline is emitted.
get_step() {
    local state
    # $(<file) reads the file without spawning cat; the path is quoted so it may contain spaces
    state="$(<"${STATEFILE}")"
    # printf rather than echo so a state that looks like an echo option (e.g. "-n") is printed verbatim
    printf '%s\n' "${state}"
}
# Overwrite ${STATEFILE} with the given state string.
# Arguments:
#   $1 - state to record, formatted "<mode>-<sequence>" by the callers in this script
set_step() {
    # Quote the redirect target (unquoted, a path containing spaces is an "ambiguous redirect") and use
    # printf so the value is written verbatim even if it resembles an echo option
    printf '%s\n' "$1" > "${STATEFILE}"
}
# Our acquire_lock function; this is the meat of the script | |
# acquire_lock {function_to_run} {last_step} {mode_we_are_in} {sequence_we_are_at}
#
# Conditionally run one step of the sequence while holding an exclusive flock on ${STATEFILE}.
#
# Arguments:
#   $1 - name of the function to call for this step
#   $2 - the state that was recorded when this script started, as "<mode>-<sequence>"
#   $3 - the mode this invocation is running in (${MODE_A} or ${MODE_B})
#   $4 - the sequence number of this step within the current mode
# Globals read: MODE_A, MODE_B, TOTAL_STEPS, TIMEOUT, WAITTIME, STATEFILE
# Calls: set_step, get_step, flock
#
# The step is run when either:
#   1. The last state was recorded in the same mode and its sequence is less than this step; or
#   2. The last state was recorded in the other mode and its sequence is at least
#      TOTAL_STEPS minus this step (i.e. the peer got far enough that this step must be reversed).
# Otherwise nothing happens and the function returns.
#
# After running, the function releases the lock, waits ${WAITTIME} seconds, and re-reads the state. If the
# state is no longer the one just written, another instance has taken control and the whole script exits
# with status 0. This is the defining feature of this script setup, allowing a cross run to cancel the
# current run. The script also exits with status 0 if the lock cannot be obtained within ${TIMEOUT} seconds.
acquire_lock() {
    local fn=${1}
    local last_step=${2}
    local mode=${3}
    local seq=${4}
    local last_mode=${last_step%%-*}
    local last_seq=${last_step##*-}
    local peer_mode
    local do_run=""
    local lock_fd

    # Work out which mode is "the other one"; an unrecognised mode never runs anything
    case ${mode} in
        "${MODE_A}") peer_mode=${MODE_B} ;;
        "${MODE_B}") peer_mode=${MODE_A} ;;
        *) return 0 ;;
    esac

    # The sequence comes from the state file; only feed plain decimal digits into arithmetic (forcing
    # base 10 so a leading zero is not parsed as octal) and treat anything else as 0
    if [[ ${last_seq} =~ ^[0-9]+$ ]]; then
        last_seq=$(( 10#${last_seq} ))
    else
        last_seq=0
    fi

    # The comparison is symmetric between the two modes, so a single check covers both
    if [[ ${last_mode} == "${mode}" ]]; then
        if (( last_seq < seq )); then
            do_run="true"
        fi
    elif [[ ${last_mode} == "${peer_mode}" ]]; then
        if (( last_seq >= TOTAL_STEPS - seq )); then
            do_run="true"
        fi
    fi
    if [[ -z ${do_run} ]]; then
        return 0
    fi

    # Acquire the lock on a freshly allocated file descriptor
    exec {lock_fd}<>"${STATEFILE}"
    if flock -x -w "${TIMEOUT}" "${lock_fd}"; then
        # Print that we're executing the given function
        echo "Executing ${mode} step ${seq} (${fn})"
        # Run the function
        "${fn}"
        # Update the step
        set_step "${mode}-${seq}"
    else
        # Failed to acquire a lock; some race condition happened, so just exit
        echo "Failed to acquire lock for ${fn}."
        exit 0
    fi
    # Release the lock, then close the descriptor so it is not leaked into later steps
    flock -u "${lock_fd}"
    exec {lock_fd}>&-

    # Wait a small amount of time for possible cross locks to be acquired
    echo "Waiting ${WAITTIME}s for locks..."
    sleep "${WAITTIME}"

    # Recheck what our last step was; if it's different from the step we just set, then exit
    if [[ $( get_step ) != "${mode}-${seq}" ]]; then
        echo "State not expected after waiting; another script has taken control."
        exit 0
    fi
}
# Populate the statefile if it's missing; this first time, the next function will fail to run properly so make | |
# note of that | |
# There's no real way around this due to missing information and the undefined starting state in such a case | |
if [[ ! -f "${STATEFILE}" ]]; then
    # Seed a placeholder state; "unknown" matches neither mode, so no step runs on this first invocation.
    # The redirect creates the file itself, so no separate touch is needed.
    printf '%s\n' "unknown-0" > "${STATEFILE}"
fi
# Our main case based on which of the two modes was provided; any other mode, and we exit with an error | |
case "${1:-}" in
    "${MODE_A}")
        # We try to acquire a lock on the statefile manually; this lets us block (for up to TIMEOUT
        # seconds) waiting for another copy of the script to finish its current step
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            # Here we read whatever the last mode+step in the statefile was
            LAST_STEP=$( get_step )
            LAST_MODE=${LAST_STEP%%-*}
            LAST_SEQ=${LAST_STEP##*-}
            if [[ ${LAST_MODE} == "${MODE_A}" ]]; then
                # If we're in the same mode, just continue from where we left off
                set_step "${MODE_A}-${LAST_SEQ}"
            else
                # If we're in the peer mode, set sequence 0 so we cancel the other job and take control
                set_step "${MODE_A}-0"
            fi
        else
            echo "Failed to acquire a lock in the specified time."
            exit 1
        fi
        # Unlock and close the descriptor so it is not leaked into the steps below
        flock -u "${FD}"
        exec {FD}>&-
        # Each step receives the state read above (before we overwrote it); it is quoted so that an
        # empty state cannot shift the remaining arguments
        # Run fn1 as step 1
        acquire_lock fn1 "${LAST_STEP}" "${MODE_A}" 1
        # Run fn2 as step 2
        acquire_lock fn2 "${LAST_STEP}" "${MODE_A}" 2
        # Run fn3 as step 3
        acquire_lock fn3 "${LAST_STEP}" "${MODE_A}" 3
        # Run fn4 as step 4
        acquire_lock fn4 "${LAST_STEP}" "${MODE_A}" 4
        # Acquire one last lock and write the final step out manually
        # Technically, we don't really need this, because the last part of the last acquire_lock wrote a
        # "final" state, but I like having this for the sense of finality and ensuring that there is a
        # record that the entire script finished
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            set_step "${MODE_A}-${TOTAL_STEPS}"
        fi
        flock -u "${FD}"
        exec {FD}>&-
        ;;
    "${MODE_B}")
        # Everything here is exactly like the first case, except MODE_A becomes MODE_B and the functions
        # are called in reverse, showing how the functions are decoupled from the step logic
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            LAST_STEP=$( get_step )
            LAST_MODE=${LAST_STEP%%-*}
            LAST_SEQ=${LAST_STEP##*-}
            if [[ ${LAST_MODE} == "${MODE_B}" ]]; then
                set_step "${MODE_B}-${LAST_SEQ}"
            else
                set_step "${MODE_B}-0"
            fi
        else
            echo "Failed to acquire a lock in the specified time."
            exit 1
        fi
        flock -u "${FD}"
        exec {FD}>&-
        acquire_lock fn4 "${LAST_STEP}" "${MODE_B}" 1
        acquire_lock fn3 "${LAST_STEP}" "${MODE_B}" 2
        acquire_lock fn2 "${LAST_STEP}" "${MODE_B}" 3
        acquire_lock fn1 "${LAST_STEP}" "${MODE_B}" 4
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            set_step "${MODE_B}-${TOTAL_STEPS}"
        fi
        flock -u "${FD}"
        exec {FD}>&-
        ;;
    *)
        # If we get an invalid (or missing) mode, show what the valid ones are and exit
        echo "Invalid mode given; our valid modes are:"
        echo "$MODE_A $MODE_B"
        exit 1
        ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment