joshuaboniface · March 12, 2025 05:26
diff --git a/cross-lock-example.sh b/cross-lock-example.sh
 #!/usr/bin/env bash

 ##################################################################################################################
 # An example of a cross-locking sequence script
 #
 # The purpose of this script is to provide a basic framework for what I'll call a "cross-locking sequence script".
 # For example, let's take my usecase: you have a set of steps that need to be taken to shut down a complex server,
 # and another (reversed) set of steps taken to bring up the server. This script could then be called by, for
 # example, NUT (Network UPS Tools) based on UPS power state, and ensure you get a consistent shutdown/startup
 # process.
 #
 # See it in action at https://youtu.be/fv3FSAxrzEE
 #
 # Why would you want the "cross-locking"? Say your shutdown process is 5 steps long:
 #  * Prepare for shutdown
 #  * Shut down application A
 #  * Shut down application B
 #  * Shut down application C
 #  * Terminate the server
 # And where each step can take quite some time to handle (say each app takes 5 minutes to stop and start).
 #
 # You start this process when the UPS battery gets low. Now, there are a couple of things that can happen:
 #  1. The UPS battery gets low, and you start running the steps above. Then the power comes back on.
 #  2. While the power has come back on, it goes back out again while the battery remains low.
 #
 # In both of these cases, you have a situation where you want to stop whatever action is currently happening (for
 # the shutdown, to stop the shutdown steps at whatever step you are on) and then trigger the opposite action, but
 # only starting at the last state you were at (for the startup, turning back on the applications you stopped but
 # not wasting time trying to "start up" ones you didn't stop).
 # 
 # This script gives you a template to automate those steps while ensuring that, at the end of each step, there's
 # an opportunity for the current "mode" of the script (shutdown or startup) to end cleanly if the opposite mode's
 # script is waiting to run, and then have the opposite mode reverse only the steps that it got through, without
 # wasting time trying to execute reversals of steps that never happened.
 #
 # For a concrete example, say we're bringing down the server with the steps above. During the shutdown of
 # Application B, the power comes back on, so a startup script fires. The startup script will wait for that last
 # step of the shutdown script to finish, then it acquires the lock. Next, the shutdown script will notice that it
 # no longer has the proper lock state, so it terminates. And the startup script will know that the last step the
 # shutdown script was at was the "Shut down Application B", so instead of going through the inverse of "Terminate
 # the server" and "Start Application C" (however long those might take), it starts right at "Start Application B", 
 # possibly saving you valuable downtime.
 #
 # In reality, instead of one server with a bunch of applications, I'm using this to perform a mass shutdown of
 # dozens of VMs and 2 separate storage clusters, but explaining how *that* works would not be nearly as easy!
 #
 # Read on to understand the details and how to customize this script for your own usecase!
 #
 # This script passes ShellCheck v0.9.0
 ##################################################################################################################

 # You can turn on xtrace here to show the full details of what is being called (for debugging)
 #set -o xtrace

 # Define our modes; these are binary (exactly 2 of them), and one of them must be specified as argument 1 to
 # the script
 MODE_A="shutdown"
 MODE_B="startup"

 # Define the total number of steps; this is the number of functions to run plus 1
 # The extra value is the "sequence finished" marker that the main case writes to the state file at the very end
 TOTAL_STEPS=5

 # Define our lock timeout; this should be a little more than twice the time a single function *should* take to run
 # It is passed to "flock -w", i.e. how long to wait for the lock before giving up
 TIMEOUT=30

 # Define our state file; for what I'm doing, this must be persistent so I use "/var/spool" for storage
 # This same file is both the lock target and the place where the last "<mode>-<sequence>" is recorded
 STATEFILE="/var/spool/test.state"

 # Define our wait time in seconds; in this example, this is both the time used inside the functions as well as the
 # time waiting for locks between functions; you can tweak this as needed
 WAITTIME=2

 # Define our functions; you can use anything you want as these are called by name in each step
 # These examples simply print something out and wait ${WAITTIME} seconds; if you were to use this script for real,
 # you'd want to replace these with your own functions that do real things, and probably have separate functions
 # for each mode (e.g. thing_startup and thing_shutdown functions for the "thing" step).
 fn1() {
    # Placeholder step 1: announce, pause for ${WAITTIME} seconds, then report completion on the same line
    printf '%s' "1... "
    sleep ${WAITTIME}
    printf '%s\n' "done."
 }
 fn2() {
    # Placeholder step 2: announce, pause for ${WAITTIME} seconds, then report completion on the same line
    printf '%s' "2... "
    sleep ${WAITTIME}
    printf '%s\n' "done."
 }
 fn3() {
    # Placeholder step 3: announce, pause for ${WAITTIME} seconds, then report completion on the same line
    printf '%s' "3... "
    sleep ${WAITTIME}
    printf '%s\n' "done."
 }
 fn4() {
    # Placeholder step 4: announce, pause for ${WAITTIME} seconds, then report completion on the same line
    printf '%s' "4... "
    sleep ${WAITTIME}
    printf '%s\n' "done."
 }

 # State file getter: print the current contents of ${STATEFILE} (normally "<mode>-<sequence>") on stdout.
 # Globals: STATEFILE (read), STATE (written; kept global as before in case something reads it after a direct call)
 # The path is quoted so a state file location containing whitespace works, and printf is used instead of echo so
 # that a value that happens to look like an echo option (e.g. "-n") is still printed verbatim.
 get_step() {
    STATE="$( cat -- "${STATEFILE}" )"
    printf '%s\n' "${STATE}"
 }
 # State file setter: overwrite ${STATEFILE} with the single line given as $1 (normally "<mode>-<sequence>").
 # Globals: STATEFILE (read)
 # The redirect target is quoted; unquoted, bash reports "ambiguous redirect" for a path containing whitespace.
 set_step() {
    printf '%s\n' "${1}" > "${STATEFILE}"
 }

 # Our acquire_lock function; this is the meat of the script
 #
 # Usage: acquire_lock {function_to_run} {last_step} {mode_we_are_in} {sequence_we_are_at}
 #   function_to_run     name of the function to call for this step
 #   last_step           the "<mode>-<sequence>" value that was in the state file when this run started
 #   mode_we_are_in      ${MODE_A} or ${MODE_B}
 #   sequence_we_are_at  position of this step within the current mode's sequence
 # Globals read: MODE_A, MODE_B, TOTAL_STEPS, TIMEOUT, WAITTIME, STATEFILE
 # Returns 0 when the step was skipped or completed; exits the whole script (status 0) when the lock cannot be
 # obtained or when another instance changed the state file while we were waiting.
 #
 # All working variables are local, so nothing leaks into the caller, and the descriptor opened for the lock is
 # closed again once the step is done instead of accumulating one open descriptor per executed step.
 acquire_lock() {
    local FN=${1}
    local LAST_STEP=${2}
    local MODE=${3}
    local SEQ=${4}
    local DO_RUN=""
    local A_SEQ B_SEQ FD

    # Decide whether this step has to run, based on where the previous run stopped:
    #   1. Previous run was in the same mode and stopped before this step: run it.
    #   2. Previous run was in the other mode and got far enough that this step undoes something it did: run it
    #      (the other mode's sequence numbers are mirrored through TOTAL_STEPS to make them comparable).
    #   3. Otherwise, do nothing.
    if [[ ${MODE} == "${MODE_A}" ]]; then
        A_SEQ=$(( TOTAL_STEPS - SEQ ))
        B_SEQ=${SEQ}
        if [[ ${LAST_STEP%%-*} == "${MODE_B}" && ${LAST_STEP##*-} -ge ${A_SEQ} ]] || \
           [[ ${LAST_STEP%%-*} == "${MODE_A}" && ${LAST_STEP##*-} -lt ${B_SEQ} ]]; then
            DO_RUN="true"
        fi
    elif [[ ${MODE} == "${MODE_B}" ]]; then
        A_SEQ=${SEQ}
        B_SEQ=$(( TOTAL_STEPS - SEQ ))
        if [[ ${LAST_STEP%%-*} == "${MODE_A}" && ${LAST_STEP##*-} -ge ${B_SEQ} ]] || \
           [[ ${LAST_STEP%%-*} == "${MODE_B}" && ${LAST_STEP##*-} -lt ${A_SEQ} ]]; then
            DO_RUN="true"
        fi
    fi

    # Nothing to do for this step
    if [[ -z ${DO_RUN} ]]; then
        return 0
    fi

    # Acquire the lock on the state file, run the function, then record the step
    exec {FD}<>"${STATEFILE}"
    if flock -x -w "${TIMEOUT}" "${FD}"; then
        # Print that we're executing the given function
        echo "Executing ${MODE} step ${SEQ} (${FN})"
        # Run the function (left unquoted on purpose, exactly as before)
        ${FN}
        # Update the step
        set_step "${MODE}-${SEQ}"
    else
        # Failed to acquire a lock; some race condition happened, so just exit
        echo "Failed to acquire lock for ${FN}."
        exit 0
    fi
    # Release the lock and close the descriptor so it is not leaked
    flock -u "${FD}"
    exec {FD}>&-

    # Wait a small amount of time for possible cross locks to be acquired
    echo "Waiting ${WAITTIME}s for locks..."
    sleep "${WAITTIME}"

    # Recheck what our last step was; if it's different from the step we just recorded, another instance has
    # taken control, so exit. This is the defining feature of this script setup: a cross run cancels the current
    # run and starts at the first valid point in the sequence.
    if [[ $( get_step ) != "${MODE}-${SEQ}" ]]; then
        echo "State not expected after waiting; another script has taken control."
        exit 0
    fi
 }

 # Populate the statefile if it's missing. Take note: on this first run the recorded state is "unknown-0", which
 # belongs to neither mode, so acquire_lock will skip every step and the run only ends up recording the requested
 # mode as finished.
 # There's no real way around this due to missing information and the undefined starting state in such a case
 # The redirect creates the file by itself, so no separate "touch" is needed; the path is quoted so a location
 # containing whitespace works.
 if [[ ! -f ${STATEFILE} ]]; then
    echo "unknown-0" > "${STATEFILE}"
 fi

 # Take control of the state file on behalf of the mode given as $1.
 # We lock the statefile manually; this lets us block waiting for another copy of the script to finish its
 # current step. Once we hold the lock we remember what the previous run recorded and then stamp our own mode.
 # Globals read:    STATEFILE, TIMEOUT
 # Globals written: LAST_STEP, LAST_MODE, LAST_SEQ (the state found before we took over)
 # Exits the script with status 1 if the lock cannot be obtained within ${TIMEOUT} seconds.
 take_control() {
    local mode=${1}
    local fd

    exec {fd}<>"${STATEFILE}"
    if ! flock -x -w "${TIMEOUT}" "${fd}"; then
        echo "Failed to acquire a lock in the specified time."
        exit 1
    fi

    # Here we read whatever the last mode+step in the statefile was
    LAST_STEP=$( get_step )
    LAST_MODE=${LAST_STEP%%-*}
    LAST_SEQ=${LAST_STEP##*-}

    if [[ ${LAST_MODE} == "${mode}" ]]; then
        # If we're in the same mode, just continue from where we left off
        set_step "${mode}-${LAST_SEQ}"
    else
        # If we're in the peer mode, set sequence 0 so we cancel the other job and take control
        set_step "${mode}-0"
    fi

    # Release the lock and close the descriptor so it is not leaked
    flock -u "${fd}"
    exec {fd}>&-
 }

 # Acquire one last lock and write the final step ("<mode>-${TOTAL_STEPS}") for the mode given as $1.
 # Technically, we don't really need this, because the last acquire_lock already wrote a "final" state, but it
 # leaves a record that the entire sequence finished. If the lock cannot be obtained the write is skipped.
 # Globals read: STATEFILE, TIMEOUT, TOTAL_STEPS
 mark_complete() {
    local mode=${1}
    local fd

    exec {fd}<>"${STATEFILE}"
    if flock -x -w "${TIMEOUT}" "${fd}"; then
        set_step "${mode}-${TOTAL_STEPS}"
    fi
    flock -u "${fd}"
    exec {fd}>&-
 }

 # Our main case based on which of the two modes was provided; any other mode, and we exit with an error
 case "${1}" in
    "${MODE_A}")
        take_control "${MODE_A}"

        # Run fn1..fn4 as steps 1..4; every call gets the state found at startup so it can decide to skip
        acquire_lock fn1 "${LAST_STEP}" "${MODE_A}" 1
        acquire_lock fn2 "${LAST_STEP}" "${MODE_A}" 2
        acquire_lock fn3 "${LAST_STEP}" "${MODE_A}" 3
        acquire_lock fn4 "${LAST_STEP}" "${MODE_A}" 4

        mark_complete "${MODE_A}"
    ;;
    "${MODE_B}")
        # Everything here is exactly like the first case, except MODE_A becomes MODE_B and the functions are
        # called in reverse, showing how the functions are decoupled from the step logic
        take_control "${MODE_B}"

        acquire_lock fn4 "${LAST_STEP}" "${MODE_B}" 1
        acquire_lock fn3 "${LAST_STEP}" "${MODE_B}" 2
        acquire_lock fn2 "${LAST_STEP}" "${MODE_B}" 3
        acquire_lock fn1 "${LAST_STEP}" "${MODE_B}" 4

        mark_complete "${MODE_B}"
    ;;
    *)
        # If we get an invalid mode, show what the valid ones are and exit
        echo "Invalid mode given; our valid modes are:"
        echo "$MODE_A $MODE_B"
        exit 1
    ;;
 esac
	#!/usr/bin/env bash

	##################################################################################################################
	# An example of a cross-locking sequence script
	#
	# The purpose of this script is to provide a basic framework for what I'll call a "cross-locking sequence script".
	# For example, let's take my usecase: you have a set of steps that need to be taken to shut down a complex server,
	# and another (reversed) set of steps taken to bring up the server. This script could then be called by, for
	# example, NUT (Network UPS Tools) based on UPS power state, and ensure you get a consistent shutdown/startup
	# process.
	#
	# See it in action at https://youtu.be/fv3FSAxrzEE
	#
	# Why would you want the "cross-locking"? Say your shutdown process is 5 steps long:
	# * Prepare for shutdown
	# * Shut down application A
	# * Shut down application B
	# * Shut down application C
	# * Terminate the server
	# And where each step can take quite some time to handle (say each app takes 5 minutes to stop and start).
	#
	# You start this process when the UPS battery gets low. Now, there are a couple of things that can happen:
	# 1. The UPS battery gets low, and you start running the steps above. Then the power comes back on.
	# 2. While the power has come back on, it goes back out again while the battery remains low.
	#
	# In both of these cases, you have a situation where you want to stop whatever action is currently happening (for
	# the shutdown, to stop the shutdown steps at whatever step you are on) and then trigger the opposite action, but
	# only starting at the last state you were at (for the startup, turning back on the applications you stopped but
	# not wasting time trying to "start up" ones you didn't stop).
	#
	# This script gives you a template to automate those steps while ensuring that, at the end of each step, there's
	# an opportunity for the current "mode" of the script (shutdown or startup) to end cleanly if the opposite mode's
	# script is waiting to run, and then have the opposite mode reverse only the steps that it got through, without
	# wasting time trying to execute reversals of steps that never happened.
	#
	# For a concrete example, say we're bringing down the server with the steps above. During the shutdown of
	# Application B, the power comes back on, so a startup script fires. The startup script will wait for that last
	# step of the shutdown script to finish, then it acquires the lock. Next, the shutdown script will notice that it
	# no longer has the proper lock state, so it terminates. And the startup script will know that the last step the
	# shutdown script was at was the "Shut down Application B", so instead of going through the inverse of "Terminate
	# the server" and "Start Application C" (however long those might take), it starts right at "Start Application B",
	# possibly saving you valuable downtime.
	#
	# In reality, instead of one server with a bunch of applications, I'm using this to perform a mass shutdown of
	# dozens of VMs and 2 separate storage clusters, but explaining how that works would not be nearly as easy!
	#
	# Read on to understand the details and how to customize this script for your own usecase!
	#
	# This script passes ShellCheck v0.9.0
	##################################################################################################################

	# You can turn on xtrace here to show the full details of what is being called (for debugging)
	#set -o xtrace

	# Define our modes; these are binary (exactly 2 of them), and one of them must be specified as argument 1 to
	# the script
	MODE_A="shutdown"
	MODE_B="startup"

	# Define the total number of steps; this is the number of functions to run plus 1
	# The extra value is the "sequence finished" marker that the main case writes to the state file at the very end
	TOTAL_STEPS=5

	# Define our lock timeout; this should be a little more than twice the time a single function should take to run
	# It is passed to "flock -w", i.e. how long to wait for the lock before giving up
	TIMEOUT=30

	# Define our state file; for what I'm doing, this must be persistent so I use "/var/spool" for storage
	# This same file is both the lock target and the place where the last "<mode>-<sequence>" is recorded
	STATEFILE="/var/spool/test.state"

	# Define our wait time in seconds; in this example, this is both the time used inside the functions as well as the
	# time waiting for locks between functions; you can tweak this as needed
	WAITTIME=2

	# Define our functions; you can use anything you want as these are called by name in each step
	# These examples simply print something out and wait ${WAITTIME} seconds; if you were to use this script for real,
	# you'd want to replace these with your own functions that do real things, and probably have separate functions
	# for each mode (e.g. thing_startup and thing_shutdown functions for the "thing" step).
	fn1() {
	    # Placeholder step 1: announce, pause for ${WAITTIME} seconds, then report completion on the same line
	    printf '%s' "1... "
	    sleep ${WAITTIME}
	    printf '%s\n' "done."
	}
	fn2() {
	    # Placeholder step 2: announce, pause for ${WAITTIME} seconds, then report completion on the same line
	    printf '%s' "2... "
	    sleep ${WAITTIME}
	    printf '%s\n' "done."
	}
	fn3() {
	    # Placeholder step 3: announce, pause for ${WAITTIME} seconds, then report completion on the same line
	    printf '%s' "3... "
	    sleep ${WAITTIME}
	    printf '%s\n' "done."
	}
	fn4() {
	    # Placeholder step 4: announce, pause for ${WAITTIME} seconds, then report completion on the same line
	    printf '%s' "4... "
	    sleep ${WAITTIME}
	    printf '%s\n' "done."
	}

	# State file getter: print the current contents of ${STATEFILE} (normally "<mode>-<sequence>") on stdout.
	# Globals: STATEFILE (read), STATE (written; kept global as before in case something reads it after a direct call)
	# The path is quoted so a state file location containing whitespace works, and printf is used instead of echo so
	# that a value that happens to look like an echo option (e.g. "-n") is still printed verbatim.
	get_step() {
	    STATE="$( cat -- "${STATEFILE}" )"
	    printf '%s\n' "${STATE}"
	}
	# State file setter: overwrite ${STATEFILE} with the single line given as $1 (normally "<mode>-<sequence>").
	# Globals: STATEFILE (read)
	# The redirect target is quoted; unquoted, bash reports "ambiguous redirect" for a path containing whitespace.
	set_step() {
	    printf '%s\n' "${1}" > "${STATEFILE}"
	}

	# Our acquire_lock function; this is the meat of the script
	#
	# Usage: acquire_lock {function_to_run} {last_step} {mode_we_are_in} {sequence_we_are_at}
	#   function_to_run     name of the function to call for this step
	#   last_step           the "<mode>-<sequence>" value that was in the state file when this run started
	#   mode_we_are_in      ${MODE_A} or ${MODE_B}
	#   sequence_we_are_at  position of this step within the current mode's sequence
	# Globals read: MODE_A, MODE_B, TOTAL_STEPS, TIMEOUT, WAITTIME, STATEFILE
	# Returns 0 when the step was skipped or completed; exits the whole script (status 0) when the lock cannot be
	# obtained or when another instance changed the state file while we were waiting.
	#
	# The mode/sequence split uses "%%-*" (everything before the first dash) and "##*-" (everything after the last
	# dash), and the two alternative conditions are joined with a plain "||". All working variables are local, and
	# the descriptor opened for the lock is closed again once the step is done.
	acquire_lock() {
	    local FN=${1}
	    local LAST_STEP=${2}
	    local MODE=${3}
	    local SEQ=${4}
	    local DO_RUN=""
	    local A_SEQ B_SEQ FD

	    # Decide whether this step has to run, based on where the previous run stopped:
	    #   1. Previous run was in the same mode and stopped before this step: run it.
	    #   2. Previous run was in the other mode and got far enough that this step undoes something it did: run it
	    #      (the other mode's sequence numbers are mirrored through TOTAL_STEPS to make them comparable).
	    #   3. Otherwise, do nothing.
	    if [[ ${MODE} == "${MODE_A}" ]]; then
	        A_SEQ=$(( TOTAL_STEPS - SEQ ))
	        B_SEQ=${SEQ}
	        if [[ ${LAST_STEP%%-*} == "${MODE_B}" && ${LAST_STEP##*-} -ge ${A_SEQ} ]] || \
	           [[ ${LAST_STEP%%-*} == "${MODE_A}" && ${LAST_STEP##*-} -lt ${B_SEQ} ]]; then
	            DO_RUN="true"
	        fi
	    elif [[ ${MODE} == "${MODE_B}" ]]; then
	        A_SEQ=${SEQ}
	        B_SEQ=$(( TOTAL_STEPS - SEQ ))
	        if [[ ${LAST_STEP%%-*} == "${MODE_A}" && ${LAST_STEP##*-} -ge ${B_SEQ} ]] || \
	           [[ ${LAST_STEP%%-*} == "${MODE_B}" && ${LAST_STEP##*-} -lt ${A_SEQ} ]]; then
	            DO_RUN="true"
	        fi
	    fi

	    # Nothing to do for this step
	    if [[ -z ${DO_RUN} ]]; then
	        return 0
	    fi

	    # Acquire the lock on the state file, run the function, then record the step
	    exec {FD}<>"${STATEFILE}"
	    if flock -x -w "${TIMEOUT}" "${FD}"; then
	        # Print that we're executing the given function
	        echo "Executing ${MODE} step ${SEQ} (${FN})"
	        # Run the function (left unquoted on purpose, exactly as before)
	        ${FN}
	        # Update the step
	        set_step "${MODE}-${SEQ}"
	    else
	        # Failed to acquire a lock; some race condition happened, so just exit
	        echo "Failed to acquire lock for ${FN}."
	        exit 0
	    fi
	    # Release the lock and close the descriptor so it is not leaked
	    flock -u "${FD}"
	    exec {FD}>&-

	    # Wait a small amount of time for possible cross locks to be acquired
	    echo "Waiting ${WAITTIME}s for locks..."
	    sleep "${WAITTIME}"

	    # Recheck what our last step was; if it's different from the step we just recorded, another instance has
	    # taken control, so exit. This is the defining feature of this script setup: a cross run cancels the current
	    # run and starts at the first valid point in the sequence.
	    if [[ $( get_step ) != "${MODE}-${SEQ}" ]]; then
	        echo "State not expected after waiting; another script has taken control."
	        exit 0
	    fi
	}

	# Populate the statefile if it's missing. Take note: on this first run the recorded state is "unknown-0", which
	# belongs to neither mode, so acquire_lock will skip every step and the run only ends up recording the requested
	# mode as finished.
	# There's no real way around this due to missing information and the undefined starting state in such a case
	# The redirect creates the file by itself, so no separate "touch" is needed; the path is quoted so a location
	# containing whitespace works.
	if [[ ! -f ${STATEFILE} ]]; then
	    echo "unknown-0" > "${STATEFILE}"
	fi

	# Take control of the state file on behalf of the mode given as $1.
	# We lock the statefile manually; this lets us block waiting for another copy of the script to finish its
	# current step. Once we hold the lock we remember what the previous run recorded and then stamp our own mode.
	# Globals read:    STATEFILE, TIMEOUT
	# Globals written: LAST_STEP, LAST_MODE, LAST_SEQ (the state found before we took over)
	# Exits the script with status 1 if the lock cannot be obtained within ${TIMEOUT} seconds.
	take_control() {
	    local mode=${1}
	    local fd

	    exec {fd}<>"${STATEFILE}"
	    if ! flock -x -w "${TIMEOUT}" "${fd}"; then
	        echo "Failed to acquire a lock in the specified time."
	        exit 1
	    fi

	    # Here we read whatever the last mode+step in the statefile was
	    LAST_STEP=$( get_step )
	    LAST_MODE=${LAST_STEP%%-*}
	    LAST_SEQ=${LAST_STEP##*-}

	    if [[ ${LAST_MODE} == "${mode}" ]]; then
	        # If we're in the same mode, just continue from where we left off
	        set_step "${mode}-${LAST_SEQ}"
	    else
	        # If we're in the peer mode, set sequence 0 so we cancel the other job and take control
	        set_step "${mode}-0"
	    fi

	    # Release the lock and close the descriptor so it is not leaked
	    flock -u "${fd}"
	    exec {fd}>&-
	}

	# Acquire one last lock and write the final step ("<mode>-${TOTAL_STEPS}") for the mode given as $1.
	# Technically, we don't really need this, because the last acquire_lock already wrote a "final" state, but it
	# leaves a record that the entire sequence finished. If the lock cannot be obtained the write is skipped.
	# Globals read: STATEFILE, TIMEOUT, TOTAL_STEPS
	mark_complete() {
	    local mode=${1}
	    local fd

	    exec {fd}<>"${STATEFILE}"
	    if flock -x -w "${TIMEOUT}" "${fd}"; then
	        set_step "${mode}-${TOTAL_STEPS}"
	    fi
	    flock -u "${fd}"
	    exec {fd}>&-
	}

	# Our main case based on which of the two modes was provided; any other mode, and we exit with an error
	case "${1}" in
	    "${MODE_A}")
	        take_control "${MODE_A}"

	        # Run fn1..fn4 as steps 1..4; every call gets the state found at startup so it can decide to skip
	        acquire_lock fn1 "${LAST_STEP}" "${MODE_A}" 1
	        acquire_lock fn2 "${LAST_STEP}" "${MODE_A}" 2
	        acquire_lock fn3 "${LAST_STEP}" "${MODE_A}" 3
	        acquire_lock fn4 "${LAST_STEP}" "${MODE_A}" 4

	        mark_complete "${MODE_A}"
	    ;;
	    "${MODE_B}")
	        # Everything here is exactly like the first case, except MODE_A becomes MODE_B and the functions are
	        # called in reverse, showing how the functions are decoupled from the step logic
	        take_control "${MODE_B}"

	        acquire_lock fn4 "${LAST_STEP}" "${MODE_B}" 1
	        acquire_lock fn3 "${LAST_STEP}" "${MODE_B}" 2
	        acquire_lock fn2 "${LAST_STEP}" "${MODE_B}" 3
	        acquire_lock fn1 "${LAST_STEP}" "${MODE_B}" 4

	        mark_complete "${MODE_B}"
	    ;;
	    *)
	        # If we get an invalid mode, show what the valid ones are and exit
	        echo "Invalid mode given; our valid modes are:"
	        echo "$MODE_A $MODE_B"
	        exit 1
	    ;;
	esac