Created
December 7, 2021 07:11
-
-
Save aivanise/5414924f1bbcf84cbdff0a26916b75ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# automatic (linux) router failover script
# pings around and changes the default route to backup
# if all the sites are not available for TIMEOUT seconds
# source the script to get useful control functions (type help for help)
# run it every minute or so to fail the default route over automatically
# depending on the availability of the IPs
# IPS: put the list of IP addresses to be pinged prefixed by #IP, only the second field is looked at, you can put the comment in the rest
#IP 194.25.0.60 resolv-h.dtag.de | |
#IP 194.25.0.68 resolv-f.dtag.de | |
#IP 194.25.0.52 resolv-l.dtag.de | |
#IP 109.234.111.81 domaindiscount24.com | |
#IP 185.194.238.252 ip-projects.de | |
#IP 104.244.42.65 twitter.com | |
#IP 8.8.8.8 google.dns1 | |
#IP 8.8.4.4 google.dns2 | |
#IP 62.138.238.100 t-online.de.1 | |
#IP 62.138.239.100 t-online.de.2 | |
#IP 1.1.1.1 cloudflare dns | |
# wait so many seconds to make sure the line is down (may be preset in the environment)
TIMEOUT=${TIMEOUT:-30}
# pipe to this to log (default syslog)
LOGGER="logger -t failover -p local3.info"
# routes definition:
# first one is main, second one is backup
ROUTES="87.130.124.57 192.168.2.253"
# interfaces routes live on (i.e. eth0 eth1, can be the same)
INTERFACES="ftto vdsl"
# router internal ip, if you have a router cluster, whoever owns this IP is the active member
ROUTERIP=192.168.220.254
# split the space-separated lists into arrays (read -a splits without globbing)
read -r -a routes <<< "$ROUTES"
MAINROUTE=${routes[0]}
BACKUPROUTE=${routes[1]}
read -r -a ifaces <<< "$INTERFACES"
# interface names are taken from $INTERFACES, in the same order as $ROUTES
MAINIFACE=${ifaces[0]}
BACKUPIFACE=${ifaces[1]}
# do not fail back earlier than this
FAILOVER_THRESHOLD_SEC=600
#### no user serviceable parts below this line (ahahah)
# are we root or not? privileged commands are prefixed with $SUDO
if [[ $(id -u) != 0 ]]; then
  SUDO=sudo
fi
# get a device for a route, hardcoded for now
# Arguments: $1 - gateway IP, expected to be $MAINROUTE or $BACKUPROUTE
# Outputs:   the interface configured for that gateway, or "unkn" for
#            any other value
function getdev {
  case "$1" in
    # patterns are quoted so the addresses are compared literally, not as globs
    "$MAINROUTE") printf '%s\n' "$MAINIFACE" ;;
    "$BACKUPROUTE") printf '%s\n' "$BACKUPIFACE" ;;
    *) echo unkn ;;
  esac
}
# check a route by pinging a list of IPs (the "#IP" lines in this script)
# Arguments: $1 - one of the IPs from the $ROUTES variable
# Globals:   SUDO, LOGGER (read); getdev maps the gateway to its device
# Outputs:   count of unreachable hosts on stdout. Every host counts as
#            unreachable when the device is not UP or when a probe route
#            cannot be installed.
function check_route {
  local route=$1
  local dev script cip host
  local -a gw ips=() unreachable=()
  local probe=0 unr_cnt left

  dev=$(getdev "$route")
  gw=(via "$route" dev "$dev" onlink)

  # the host list lives in this very file, whether it is executed or sourced
  script=$0
  [[ "${BASH_SOURCE[0]}" != "$0" ]] && script=${BASH_SOURCE[0]}
  mapfile -t ips < <(awk '/^#IP/ && $2 != "" { print $2 }' "$script")

  if /sbin/ip link show dev "$dev" 2>/dev/null | grep -q UP; then
    probe=1
    # make sure that all test IPs are rerouted through the device
    for cip in "${ips[@]}"; do
      if ! $SUDO /sbin/ip route replace "$cip" "${gw[@]}" 2>/dev/null; then
        probe=0
        break
      fi
    done
  fi

  if ((!probe)); then
    # device down or probe routes not installable, so fake all unreachable
    unreachable=("${ips[@]}")
  elif ((${#ips[@]} > 0)); then
    # fping -u lists the unreachable targets, one per line; it is skipped for
    # an empty list because fping would then read targets from stdin
    mapfile -t unreachable < <(/usr/bin/fping -u "${ips[@]}" 2>/dev/null)
  fi

  # clean up the routes; this also runs after a partial install so no probe
  # route is left behind
  for cip in "${ips[@]}"; do
    $SUDO /sbin/ip route delete "$cip" "${gw[@]}" 2>/dev/null
  done

  unr_cnt=${#unreachable[@]}
  left=$((${#ips[@]} - unr_cnt))

  # debug
  if ((left < 3)); then
    echo "$route down, unreachable ${unreachable[*]}" | $LOGGER
  elif ((unr_cnt > 0)); then
    {
      echo -n "the following is still unreachable, tweak the list of hosts :"
      for host in "${unreachable[@]}"; do
        # exact match on the address field, so 1.1.1.1 does not pull in 11.1.1.10
        awk -v h="$host" \
          '$1 == "#IP" && $2 == h { $1 = $1; printf " %s", $0 }' "$script"
      done
      echo
    } | $LOGGER
  fi

  echo "$unr_cnt"
}
# is route active anywhere?
# Arguments: $1 - gateway IP to look for
# Returns:   0 if a default route contains the gateway as a whole word,
#            non-zero otherwise (including for an empty argument)
function route_active {
  local route=$1
  # an empty pattern would match every line, so never report it as active
  [[ -n "$route" ]] || return 1
  # -F: literal match (dots are not wildcards)
  # -w: whole word, so 10.0.0.1 does not match 10.0.0.10
  /sbin/ip route show default | grep -qFw -- "$route"
}
# is the route reachable?
# Arguments: $1 - gateway IP, passed on to check_route
# Globals:   TIMEOUT (read) - seconds to wait before the confirming re-check
# Returns:   0 if check_route reports at most 3 unreachable hosts,
#            1 if two checks TIMEOUT seconds apart both report more than 3
#            (or report anything that is not a plain count)
function route_reachable {
  local route=$1
  local unr attempt
  for attempt in 1 2; do
    unr=$(check_route "$route")
    # empty, negative or otherwise non-numeric output is treated as "down"
    if [[ "$unr" =~ ^[0-9]+$ ]] && ((10#$unr <= 3)); then
      return 0
    fi
    # recheck after TIMEOUT to make sure
    if ((attempt == 1)); then
      sleep "${TIMEOUT:-30}"
    fi
  done
  return 1
}
# fail back to main route
# Globals: MAINROUTE, SUDO, LOGGER (read); force (read) - any non-empty value
#          replaces the default route even if main is already active
# Returns: non-zero (the ip exit status) if the default route could not be
#          replaced; in that case "failback failed" is logged instead of
#          "failback done"
function failback {
  local dev rc
  if ! route_active "$MAINROUTE" || [[ -n "$force" ]]; then
    dev=$(getdev "$MAINROUTE")
    $SUDO /sbin/ip route replace default via "$MAINROUTE" dev "$dev" onlink
    rc=$?
    if ((rc != 0)); then
      echo "failback failed (ip exited with $rc)" | $LOGGER
      return "$rc"
    fi
  fi
  echo failback done | $LOGGER
}
# fail over to backup route
# Globals: BACKUPROUTE, SUDO, LOGGER (read); force (read) - any non-empty
#          value replaces the default route even if backup is already active
# Returns: non-zero (the ip exit status) if the default route could not be
#          replaced; in that case "failover failed" is logged instead of
#          "failover done"
function failover {
  local dev rc
  if ! route_active "$BACKUPROUTE" || [[ -n "$force" ]]; then
    dev=$(getdev "$BACKUPROUTE")
    $SUDO /sbin/ip route replace default via "$BACKUPROUTE" dev "$dev" onlink
    rc=$?
    if ((rc != 0)); then
      echo "failover failed (ip exited with $rc)" | $LOGGER
      return "$rc"
    fi
  fi
  echo failover done | $LOGGER
}
# tidy up after failover: refresh the /tmp/last_failover timestamp file,
# open up its permissions, then print the human readable status via lines
function post_failover {
  local stamp=/tmp/last_failover
  $SUDO touch "$stamp"
  $SUDO chmod 777 "$stamp"
  lines
}
# human readable link state: current default route(s), the failback lock,
# the time of the last failover and the last few syslog lines that
# mention "failover"
function lines {
  local defaults stamp
  # join all matching route lines into one, collapsing runs of whitespace
  defaults=$(/sbin/ip route show \
    | awk '/default/ { $1 = $1; printf "%s%s", sep, $0; sep = " " }')
  echo "default route for internal network:${defaults:+ $defaults}"
  [[ -f /tmp/no_failback ]] && echo "/tmp/no_failback is set"
  if [[ -f /tmp/last_failover ]]; then
    # %z is the status-change time; cut drops the fractional seconds and zone
    stamp=$(stat --format %z /tmp/last_failover | cut -d. -f 1)
    echo "last failover (/tmp/last_failover) was at $stamp"
  fi
  echo last 6 events:
  $SUDO grep -F failover /var/log/syslog | tail -n 6
}
# return MAIN or BACKUP or UNKNOWN depending on where the default route goes
# Outputs: MAIN, BACKUP or UNKNOWN on stdout
# Returns: 0 for MAIN, 1 for BACKUP; UNKNOWN leaves the status of echo (0)
function linkstate {
  local gw
  # the third field of "default via <gateway> ..." is the gateway address
  gw=$(ip route show default | awk '{print $3}')
  case "$gw" in
    # quoted so the addresses are compared literally, not as glob patterns
    "$MAINROUTE")
      echo "MAIN"
      return 0
      ;;
    "$BACKUPROUTE")
      echo "BACKUP"
      return 1
      ;;
    *) echo UNKNOWN ;;
  esac
}
# get network state as a bitmap, to be used in the decision tree of the
# main program
# Outputs: five characters, each 0 or 1, without a trailing newline:
#          main reachable, backup reachable, main active, backup active,
#          /tmp/no_failback present
function getstate {
  local bits=""
  # reachability first; each probe may wait TIMEOUT seconds for a re-check
  if route_reachable "$MAINROUTE"; then bits+=1; else bits+=0; fi
  if route_reachable "$BACKUPROUTE"; then bits+=1; else bits+=0; fi
  if route_active "$MAINROUTE"; then bits+=1; else bits+=0; fi
  if route_active "$BACKUPROUTE"; then bits+=1; else bits+=0; fi
  if [[ -f /tmp/no_failback ]]; then bits+=1; else bits+=0; fi
  printf '%s' "$bits"
}
# print the list of control functions available after sourcing this script
# (this function takes the place of the bash builtin "help" in that shell)
function help {
  cat <<EOF
internet lines commands:
lines - human readable link state
linkstate - simple link state MAIN/BACKUP/UNKNOWN
getstate - binary link state
failover - fail over to backup line
failback - fail back to main line
check_route
$MAINROUTE|
$BACKUPROUTE - 0 is OK, more than 3 is bad
touch /tmp/last_failover - prevent further failovers for $FAILOVER_THRESHOLD_SEC seconds
touch /tmp/no_failback - prevent failbacks forever
EOF
}
######## main program ###########################
# when the file is sourced, only the functions above are wanted: show a short
# status banner on interactive shells and stop before the failover logic
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
  if [[ -t 0 ]]; then
    # command output is passed as arguments, never as part of the format
    printf '\nsystem state: %s\nlink state: %s\n *** type help for more commands ***\n\n' \
      "$(systemctl is-system-running)" "$(linkstate)"
  fi
  # explicit status so that sourcing succeeds when stdin is not a terminal
  return 0
fi
if [[ "$1" == "linkstate" ]]; then
  linkstate
elif [[ "$1" == "rebalancevpn" ]]; then
  # NOTE(review): rebalancevpn is not defined in the part of the file visible
  # here - confirm it exists before relying on this sub-command
  rebalancevpn "${@:2:2}"
elif [[ "$1" == "vpnips" ]]; then
  # NOTE(review): vpnips is not defined in the part of the file visible here
  vpnips "${@:2:2}"
else
  # cleanup old failovers that might be stuck due to network outages
  for pid in $(pgrep -f failover.sh); do
    # kill only instances without parameters (i.e. no monitoring)
    # NOTE(review): the test below is "cmdline has exactly one NUL-separated
    # entry" - confirm that this matches how the script is launched
    if [[ "$pid" != "$$" && -f /proc/$pid/cmdline ]] \
      && [[ $(xargs -n 1 -0 echo 2>/dev/null </proc/"$pid"/cmdline | wc -l) -eq 1 ]]; then
      $SUDO kill -9 "$pid" 2>/dev/null
    fi
  done

  #### main loop #####
  # if we are not the master router, just set the default route to the master and do nothing
  if ! ip addr show | grep -qF -- "inet $ROUTERIP/"; then
    if ! ip route show default | grep -qFw -- "$ROUTERIP"; then
      # changing routes needs privileges, like every other route change here
      $SUDO /sbin/ip route replace default via "$ROUTERIP"
      echo "We are not the master, setting route to master and bailing out" | $LOGGER
    fi
    exit
  fi

  # don't flap too often
  if [[ -f /tmp/last_failover ]] \
    && [[ $(($(date +%s) - $(stat -c %Z /tmp/last_failover))) -lt $FAILOVER_THRESHOLD_SEC ]]; then
    echo "last failover was less than $FAILOVER_THRESHOLD_SEC s ago, skip checks" | $LOGGER
  else
    # where are we? also record for automatic failback
    # ($state may be preset in the environment to skip the probing)
    [[ -z "$state" ]] && state=$(getstate)
    echo "state: $state" | $LOGGER
    # make sure the history file exists so chown/chmod do not fail on the
    # very first run
    [[ -e /tmp/linkstates ]] || $SUDO touch /tmp/linkstates
    $SUDO chown root:root /tmp/linkstates
    # NOTE(review): world-writable history decides the automatic failback
    $SUDO chmod 777 /tmp/linkstates
    # tee keeps $state out of a shell command string
    printf '%s %s\n' "$(date +%s)" "$state" | $SUDO tee -a /tmp/linkstates >/dev/null

    # decision table
    # see function getstate() above for the meaning of the bits
    # 10110 01110 01111 10111 11110 11111 - both active, impossible - do nothing
    # 11100 11101 10100 10101 - main route reachable and active - do nothing
    # 01011 - backup active and reachable , no failback - do nothing
    # 11011 - backup active, main reachable, consider auto failback
    # 01010 - same but failback not possible - do nothing
    # 00000 00001 00010 00011 00100 00101 00110 00111 - nothing reachable, bummer - do nothing
    # 11010 - main reachable, backup active - failback
    # 11000 11001 10000 10001 - nothing active, main reachable - failback
    # 10010 10011 - backup active but main reachable - failback
    # 01000 01001 - nothing active, backup reachable - failover
    # 01100 01101 - main down, backup up - failover
    # for i in $(seq 0 31); do a="0000"$(bc <<< "obase=2; $i"); echo ${a:${#a}-5:5}; done
    case "$state" in
      # failover
      01000 | 01001 | 01100 | 01101)
        failover
        $SUDO touch /tmp/no_failback
        $SUDO chmod 777 /tmp/no_failback
        post_failover
        ;;
      # failback
      11010 | 11000 | 11001 | 10000 | 10001 | 10010 | 10011)
        $SUDO rm -f /tmp/no_failback
        failback
        post_failover
        ;;
      # recent failover, consider auto failback?
      11011)
        # were we in the same state for at least an hour?
        # NOTE(review): this needs exactly 60 samples inside the window, i.e.
        # one recorded run per minute - confirm against the cron schedule
        fbok=$(awk -v ts="$(date -d "60 min ago" +%s)" 'BEGIN { cnt=0; bad=0 } $1>ts { cnt++; if ($2 != "11011") bad++ } END { if (cnt==60 && bad==0 ) print "ok" }' /tmp/linkstates)
        if [[ "$fbok" ]]; then
          # rotate the history; older generations may not exist yet
          [[ -e /tmp/linkstates.2 ]] && $SUDO mv -f /tmp/linkstates.2 /tmp/linkstates.3
          [[ -e /tmp/linkstates.1 ]] && $SUDO mv -f /tmp/linkstates.1 /tmp/linkstates.2
          $SUDO mv -f /tmp/linkstates /tmp/linkstates.1
          $SUDO rm -f /tmp/no_failback
          failback
          post_failover
        fi
        ;;
      # if all is normal, do nothing
      11100 | 11101 | 10100 | 10101) ;;
    esac
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment