Created
November 13, 2016 21:25
-
-
Save rnwolf/87545373bf1294c14c481b9d8c72bc8c to your computer and use it in GitHub Desktop.
Bash helper script to help with the automation of routine snapRAID tasks. Add it as a cron job to automate the routine tasks of running sync and scrub. Script will optionally send you email of status and issues encountered during the run.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#######################################################################
# This is a helper script that keeps snapraid parity info in sync with
# your data and optionally verifies the parity info. Here's how it works:
# 1) Removes size 0 .nfo files (plex)
# 2) Calls diff to figure out if the parity info is out of sync.
# 3) If parity info is out of sync, AND the number of deleted or changed files exceeds
#    X (each configurable), it triggers an alert email and stops. (In case of
#    accidental deletions, you have the opportunity to recover them from
#    the existing parity info. This also mitigates to a degree encryption malware.)
# 4) If parity info is out of sync, AND the number of deleted or changed files exceeds X
#    AND it has reached/exceeded Y (configurable) number of warnings, force
#    a sync. (Useful when you get a false alarm above and you can't be bothered
#    to login and do a manual sync. Note the risk is if it's not a false alarm
#    and you can't access the box before Y number of times the job is run to
#    fix the issue... Well I hope you have other backups...)
# 5) If parity info is out of sync BUT the number of deleted files did NOT
#    exceed X, it calls sync to update the parity info.
# 6) If the parity info is in sync (either because nothing changed or after it
#    has successfully completed the sync job), it runs the scrub command to
#    validate the integrity of the data (both the files and the parity info).
#    Note that each run of the scrub command will validate only a (configurable)
#    portion of parity info to avoid having a long running job and affecting
#    the performance of the box.
# 7) Once all jobs are completed, it sends an email with the output to user
#    (if configured).
#
#
# Inspired by Zack Reed (http://zackreed.me/articles/83-updated-snapraid-sync-script)
#
#######################################################################
# REQUIRES:
# - mailx (simplify sending HTML emails)
# - python markdown (render Markdown to HTML)
######################
#   USER VARIABLES   #
######################
####################### USER CONFIGURATION START #######################

# address where the output of the jobs will be emailed to.
# comment it out to disable email output
EMAIL_ADDRESS="root"

# Set the threshold of deleted files to stop the sync job from running.
# NOTE that depending on how active your filesystem is being used, a low
# number here may result in your parity info being out of sync often and/or
# you having to do lots of manual sync.
DEL_THRESHOLD=100
# Same idea for updated (changed) files: chk_updated compares the "updated"
# count reported by diff against this value.
UP_THRESHOLD=500

# Set number of warnings before we force a sync job.
# This option comes in handy when you cannot be bothered to manually
# start a sync job when DEL_THRESHOLD is breached due to false alarm.
# Set to 0 to ALWAYS force a sync (i.e. ignore the delete threshold above)
# Set to -1 to NEVER force a sync (i.e. need to manual sync if delete threshold is breached)
#SYNC_WARN_THRESHOLD=3
SYNC_WARN_THRESHOLD=-1

# Set percentage of array to scrub if it is in sync.
# i.e. 0 to disable and 100 to scrub the full array in one go
# WARNING - depending on size of your array, setting to 100 will take a very long time!
SCRUB_PERCENT=10
# Passed to `snapraid scrub -o`; per the SnapRAID manual this limits the
# scrub to blocks older than this many days.
SCRUB_AGE=10

# Set the option to log SMART info. 1 to enable, any other values to disable
SMART_LOG=1

# location of the snapraid binary
SNAPRAID_BIN="/usr/bin/snapraid"

# location of the mail program binary
MAIL_BIN="/usr/bin/mailx"

##### USER CONFIGURATION STOP ##### MAKE NO CHANGES BELOW THIS LINE ####
#######################################
# Print one counter from the summary block of `snapraid diff`.
# Arguments: $1 - captured diff output
#            $2 - counter label (removed, added, moved, copied, updated)
# Outputs:   the number in front of the label, or nothing when no summary
#            line for that label exists
#######################################
function _snapraid_diff_count() {
  grep -w "^ \{1,\}[0-9]* ${2}\$" <<<"$1" | sed 's/^ *//g' | cut -d ' ' -f1 | head -n 1
}

#######################################
# Script entry point: pre-processing, DIFF, threshold checks, SYNC, SCRUB,
# SMART logging, disk spin-down and the optional email report.
# Globals:   reads the USER CONFIGURATION values; initialises and writes
#            the state shared with the helper functions (CHK_FAIL, DO_SYNC,
#            GRACEFUL, SOPHOS_RUNNING, SYNC_WARN_FILE, SYNC_WARN_COUNT,
#            TMP_OUTPUT, JOBS_DONE, the *_COUNT values, SUBJECT, MSG).
# Outputs:   everything is written to the terminal and, through tee, to
#            $TMP_OUTPUT.
# Returns:   never returns; exits 0 when all jobs were attempted, 1 when a
#            sanity check fails.
#######################################
function main() {
  ######################
  #   INIT VARIABLES   #
  ######################
  CHK_FAIL=0
  DO_SYNC=0
  EMAIL_SUBJECT_PREFIX="(SnapRAID on $(hostname))"
  GRACEFUL=0
  SOPHOS_RUNNING=0
  SYNC_WARN_FILE="/tmp/snapRAID.warnCount"
  SYNC_WARN_COUNT=""
  TMP_OUTPUT="/tmp/snapRAID.out"
  MSG=""
  # Capture time (bash increments SECONDS once per second after assignment)
  SECONDS=0

  # Expand PATH for smartctl
  PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

  # auto determine names of content and parity files
  CONTENT_FILE=$(grep -v '^$\|^\s*\#' /etc/snapraid.conf | grep snapraid.content | head -n 1 | cut -d " " -f2)
  PARITY_FILE=$(grep -v '^$\|^\s*\#' /etc/snapraid.conf | grep snapraid.parity | head -n 1 | cut -d " " -f2)

  # redirect all output to screen and file; fds 3/4 keep the original
  # stdout/stderr so sed_me and clean_desc can restore them later
  : > "$TMP_OUTPUT"
  exec 3>&1 4>&2
  # NOTE: Not preferred format but valid: exec &> >(tee -ia "${TMP_OUTPUT}" )
  exec > >(tee -ia "${TMP_OUTPUT}") 2>&1

  # NOTE: every snapraid call below runs in the foreground, so the bare
  # `wait` calls the script used to issue after them had nothing to wait
  # for. Newer bash releases make a bare `wait` also wait for the last
  # process substitution (the tee logger above), which cannot finish while
  # this shell holds its pipe, so those calls have been dropped.

  # timestamp the job
  echo "SnapRAID Script Job started [$(date)]"
  echo
  echo "----------------------------------------"

  # Remove any plex created anomalies
  echo "##Preprocessing"

  # Stop any services that may inhibit optimum execution
  echo "###Stop Services [$(date)]"
  stop_services

  echo "###Remove Zero Byte NFOs [$(date)]"
  echo "Removing any 0 byte .nfo's before SnapRAID exeuction."
  find /mnt/volume/media -name '*.nfo' -type f -size 0 -exec /bin/rm -f -- {} +

  # sanity check first to make sure we can access the content and parity
  # files. An empty path (nothing found in snapraid.conf) is an error too:
  # the previous unquoted `[ ! -e $VAR ]` silently passed in that case.
  if [[ -z "$CONTENT_FILE" || ! -e "$CONTENT_FILE" ]]; then
    echo "**ERROR** Content file ($CONTENT_FILE) not found!"
    exit 1
  fi

  if [[ -z "$PARITY_FILE" || ! -e "$PARITY_FILE" ]]; then
    echo "**ERROR** Parity file ($PARITY_FILE) not found!"
    exit 1
  fi

  echo
  echo "----------------------------------------"
  echo "##Processing"

  # Fix timestamps
  chk_zero

  # run the snapraid DIFF command. The output is captured and parsed
  # directly instead of being grepped back out of $TMP_OUTPUT: tee writes
  # the log asynchronously, so the summary lines (the last thing diff
  # prints) are not guaranteed to be in the file yet.
  echo "###SnapRAID DIFF [$(date)]"
  local diff_output
  diff_output=$("$SNAPRAID_BIN" diff 2>&1)
  printf '%s\n' "$diff_output"
  echo
  echo "DIFF finished [$(date)]"
  JOBS_DONE="DIFF"

  DEL_COUNT=$(_snapraid_diff_count "$diff_output" removed)
  ADD_COUNT=$(_snapraid_diff_count "$diff_output" added)
  MOVE_COUNT=$(_snapraid_diff_count "$diff_output" moved)
  COPY_COUNT=$(_snapraid_diff_count "$diff_output" copied)
  UPDATE_COUNT=$(_snapraid_diff_count "$diff_output" updated)

  # sanity check to make sure that we were able to get our counts from the
  # output of the DIFF job (missing or non-numeric values both count as failure)
  local counts_ok=1 count
  for count in "$DEL_COUNT" "$ADD_COUNT" "$MOVE_COUNT" "$COPY_COUNT" "$UPDATE_COUNT"; do
    [[ "$count" =~ ^[0-9]+$ ]] || counts_ok=0
  done
  if (( counts_ok == 0 )); then
    # failed to get one or more of the count values, lets report to user and exit with error code
    echo "**ERROR** - failed to get one or more count values. Unable to proceed."
    echo "Exiting script. [$(date)]"
    if [[ -n "${EMAIL_ADDRESS:-}" ]]; then
      SUBJECT="$EMAIL_SUBJECT_PREFIX WARNING - Unable to proceed with SYNC/SCRUB job(s). Check DIFF job output."
      send_mail
    fi
    exit 1
  fi

  echo
  echo "**SUMMARY of changes - Added [$ADD_COUNT] - Deleted [$DEL_COUNT] - Moved [$MOVE_COUNT] - Copied [$COPY_COUNT] - Updated [$UPDATE_COUNT]**"
  echo

  # check if the conditions to run SYNC are met
  # CHK 1 - if files have changed
  if (( DEL_COUNT > 0 || ADD_COUNT > 0 || MOVE_COUNT > 0 || COPY_COUNT > 0 || UPDATE_COUNT > 0 )); then
    chk_del

    # only look at the update threshold when the delete check passed
    if (( CHK_FAIL == 0 )); then
      chk_updated
    fi

    # a failed check hands the final sync decision to the warning counter
    if (( CHK_FAIL == 1 )); then
      chk_sync_warn
    fi
  else
    # NO, so let's skip SYNC
    echo "No change detected. Not running SYNC job. [$(date)] "
    DO_SYNC=0
  fi

  # Now run sync if conditions are met
  if (( DO_SYNC == 1 )); then
    echo "###SnapRAID SYNC [$(date)]"
    "$SNAPRAID_BIN" sync -q
    echo "SYNC finished [$(date)]"
    JOBS_DONE="$JOBS_DONE + SYNC"
    # insert SYNC marker to 'Everything OK' or 'Nothing to do' string to differentiate it from SCRUB job later
    sed_me "s/^Everything OK/SYNC_JOB--Everything OK/g;s/^Nothing to do/SYNC_JOB--Nothing to do/g" "$TMP_OUTPUT"
    # Remove any warning flags if set previously. This is done in this step to take care of scenarios when user
    # has manually synced or restored deleted files and we will have missed it in the checks above.
    if [[ -e "$SYNC_WARN_FILE" ]]; then
      rm -- "$SYNC_WARN_FILE"
    fi
    echo

    echo "SnapRAID SCRUB *Newly Added*"
    "$SNAPRAID_BIN" scrub -p new -q
    echo "SCRUB *Newly Added* finished [$(date)]"
    echo
  fi

  # Moving onto scrub now. Check if user has enabled scrub
  if (( SCRUB_PERCENT > 0 )); then
    # YES, first let's check if delete threshold has been breached and we have not forced a sync.
    if (( CHK_FAIL == 1 && DO_SYNC == 0 )); then
      # YES, parity is out of sync so let's not run scrub job
      echo "Scrub job cancelled as parity info is out of sync (deleted or changed files threshold has been breached). [$(date)]"
    # NO, delete threshold has not been breached OR we forced a sync, but we have one last test -
    # let's make sure if sync ran, it completed successfully (by checking for our marker text "SYNC_JOB--" in the output).
    # The marker is matched at line start only, so the warning printed
    # below (which quotes the marker) can never satisfy later checks.
    elif (( DO_SYNC == 1 )) && ! grep -q '^SYNC_JOB--' "$TMP_OUTPUT"; then
      # Sync ran but did not complete successfully so lets not run scrub to be safe
      echo "**WARNING** - check output of SYNC job. Could not detect marker <SYNC_JOB-->. Not proceeding with SCRUB job. [$(date)]"
    else
      # Everything ok - let's run the scrub job!
      echo "###SnapRAID SCRUB [$(date)]"
      "$SNAPRAID_BIN" scrub -p "$SCRUB_PERCENT" -o "$SCRUB_AGE" -q
      echo "SCRUB finished [$(date)]"
      echo
      JOBS_DONE="$JOBS_DONE + SCRUB"
      # insert SCRUB marker to 'Everything OK' or 'Nothing to do' string to differentiate it from SYNC job above
      sed_me "s/^Everything OK/SCRUB_JOB--Everything OK/g;s/^Nothing to do/SCRUB_JOB--Nothing to do/g" "$TMP_OUTPUT"
    fi
  else
    echo "Scrub job is not enabled. Not running SCRUB job. [$(date)] "
  fi

  echo
  echo "----------------------------------------"
  echo "##Postprocessing"

  # Moving onto logging SMART info if enabled
  if [[ "${SMART_LOG:-0}" == 1 ]]; then
    echo
    "$SNAPRAID_BIN" smart
  fi

  echo "Spinning down disks..."
  "$SNAPRAID_BIN" down

  # Graceful restore of services outside of trap - for messaging
  GRACEFUL=1
  restore_services

  echo "All jobs ended. [$(date)] "

  # all jobs done, let's send output to user if configured
  if [[ -n "${EMAIL_ADDRESS:-}" ]]; then
    echo -e "Email address is set. Sending email report to **$EMAIL_ADDRESS** [$(date)]"
    # check if deleted count exceeded threshold.
    # `>=` mirrors chk_del/chk_updated, which flag a violation as soon as
    # the count reaches the threshold.
    if (( CHK_FAIL == 1 )); then
      if (( DEL_COUNT >= DEL_THRESHOLD && DO_SYNC == 0 )); then
        MSG="Deleted Files ($DEL_COUNT) / ($DEL_THRESHOLD) Violation"
      fi

      if (( DEL_COUNT >= DEL_THRESHOLD && UPDATE_COUNT >= UP_THRESHOLD && DO_SYNC == 0 )); then
        MSG="$MSG & "
      fi

      if (( UPDATE_COUNT >= UP_THRESHOLD && DO_SYNC == 0 )); then
        MSG="$MSG Changed Files ($UPDATE_COUNT) / ($UP_THRESHOLD) Violation"
      fi

      SUBJECT="[WARNING] $SYNC_WARN_COUNT - ($MSG) $EMAIL_SUBJECT_PREFIX"
    elif [[ "$JOBS_DONE" == *SYNC* ]] && ! grep -q '^SYNC_JOB--' "$TMP_OUTPUT"; then
      # Sync ran but did not complete successfully so lets warn the user
      SUBJECT="[WARNING] SYNC job ran but did not complete successfully $EMAIL_SUBJECT_PREFIX"
    elif [[ "$JOBS_DONE" == *SCRUB* ]] && ! grep -q '^SCRUB_JOB--' "$TMP_OUTPUT"; then
      # Scrub ran but did not complete successfully so lets warn the user
      SUBJECT="[WARNING] SCRUB job ran but did not complete successfully $EMAIL_SUBJECT_PREFIX"
    else
      SUBJECT="[COMPLETED] $JOBS_DONE Jobs $EMAIL_SUBJECT_PREFIX"
    fi

    ELAPSED="$((SECONDS / 3600))hrs $(((SECONDS / 60) % 60))min $((SECONDS % 60))sec"
    echo
    echo "----------------------------------------"
    echo "##Total time elapsed for SnapRAID: $ELAPSED"

    # Add a topline to email body. SUBJECT can contain '/' and '&' (see the
    # violation messages above); both are special in a sed replacement, so
    # escape them (and backslashes) before splicing the text in.
    local subject_escaped
    subject_escaped=$(printf '%s' "$SUBJECT" | sed -e 's/[\/&\\]/\\&/g')
    sed_me "1s/^/##${subject_escaped} \n/" "${TMP_OUTPUT}"
    send_mail
  fi

  clean_desc

  exit 0
}
####################### | |
# FUNCTIONS & METHODS # | |
####################### | |
#######################################
# Edit a file in place with sed while the tee logger is detached.
# stdout/stderr are pointed back at the saved descriptors 3/4 for the
# duration of the edit and re-attached to a fresh tee afterwards.
# Globals:   TMP_OUTPUT (read) - log file the new tee appends to
# Arguments: $1 - sed script
#            $2 - file to edit in place
# Returns:   exit status of sed
#######################################
function sed_me() {
  # The script starts no background jobs itself, so $! (when set) should be
  # the tee started by the most recent `exec > >(tee ...)`.
  # NOTE(review): confirm no other background job is ever introduced.
  local logger_pid="$!"
  local sed_status

  # The various redirects used force us to be a bit hands-on with sed
  exec 1>&3 2>&4 3>&- 4>&-

  # Closing the pipe lets the previous tee drain and exit; wait for it where
  # bash permits so its last writes land before sed replaces the file. Older
  # bash versions refuse to wait on a process substitution - that error is
  # expected and silenced.
  if [[ -n "$logger_pid" ]]; then
    wait "$logger_pid" 2> /dev/null
  fi

  # Run sed directly: wrapping it in $( ) would try to execute whatever sed
  # printed as a command.
  sed -i "$1" "$2"
  sed_status=$?

  exec 3>&1 4>&2
  exec > >(tee -ia "${TMP_OUTPUT}") 2>&1
  # No bare `wait` here: nothing was backgrounded, and on newer bash it
  # would block on the tee that was just started.
  return "$sed_status"
}
#######################################
# Compare the number of deleted files against DEL_THRESHOLD.
# Globals:   DEL_COUNT, DEL_THRESHOLD, ADD_COUNT, MOVE_COUNT, COPY_COUNT,
#            UPDATE_COUNT (read); DO_SYNC, CHK_FAIL (written)
# Outputs:   one status line on stdout
# Effect:    DO_SYNC=1 when deletions are below the threshold, otherwise
#            CHK_FAIL=1 (reaching the threshold already counts as a breach)
#######################################
function chk_del() {
  if (( DEL_COUNT < DEL_THRESHOLD )); then
    # NO, delete threshold not reached, lets run the sync job
    echo "Changes detected [A-$ADD_COUNT,D-$DEL_COUNT,M-$MOVE_COUNT,C-$COPY_COUNT,U-$UPDATE_COUNT] and deleted files ($DEL_COUNT) is below threshold ($DEL_THRESHOLD). SYNC Authorized."
    DO_SYNC=1
  else
    # The branch is taken at equality too, so say "reached/exceeded".
    echo "**WARNING** Deleted files ($DEL_COUNT) reached/exceeded threshold ($DEL_THRESHOLD)."
    CHK_FAIL=1
  fi
}
#######################################
# Compare the number of updated files against UP_THRESHOLD.
# Globals:   UPDATE_COUNT, UP_THRESHOLD, ADD_COUNT, DEL_COUNT, MOVE_COUNT,
#            COPY_COUNT (read); DO_SYNC, CHK_FAIL (written)
# Outputs:   one status line on stdout
# Effect:    DO_SYNC=1 when updates are below the threshold, otherwise
#            CHK_FAIL=1 (reaching the threshold already counts as a breach)
#######################################
function chk_updated() {
  if (( UPDATE_COUNT < UP_THRESHOLD )); then
    echo "Changes detected [A-$ADD_COUNT,D-$DEL_COUNT,M-$MOVE_COUNT,C-$COPY_COUNT,U-$UPDATE_COUNT] and updated files ($UPDATE_COUNT) is below threshold ($UP_THRESHOLD). SYNC Authorized."
    DO_SYNC=1
  else
    # The branch is taken at equality too, so say "reached/exceeded".
    echo "**WARNING** Updated files ($UPDATE_COUNT) reached/exceeded threshold ($UP_THRESHOLD)."
    CHK_FAIL=1
  fi
}
#######################################
# Decide whether a sync should be forced after a threshold breach.
# The number of previous warnings is persisted in SYNC_WARN_FILE.
# Globals:   SYNC_WARN_THRESHOLD, SYNC_WARN_FILE, TMP_OUTPUT (read);
#            SYNC_WARN_COUNT, DO_SYNC (written)
# Outputs:   status lines on stdout
# Effect:    threshold < 0            -> DO_SYNC=0 (forced sync disabled)
#            warnings >= threshold    -> DO_SYNC=1
#            otherwise                -> warning count incremented and
#                                        saved, DO_SYNC=0
#######################################
function chk_sync_warn() {
  local stored_count=""

  if (( SYNC_WARN_THRESHOLD > -1 )); then
    echo "Forced sync is enabled. [$(date)]"

    # Read the first line of the counter file. A missing file or a partial
    # last line makes `read` return non-zero; that is fine here.
    if [[ -r "$SYNC_WARN_FILE" ]]; then
      IFS= read -r stored_count < "$SYNC_WARN_FILE" || true
    fi
    # value is zero if file does not exist or does not contain what we are
    # expecting. (The old `sed 'q;...!d'` quit before its numeric filter
    # ran, so garbage reached the integer comparison below.)
    if [[ "$stored_count" =~ ^[0-9]+$ ]]; then
      SYNC_WARN_COUNT=$((10#$stored_count)) # 10# keeps "08" from being read as octal
    else
      SYNC_WARN_COUNT=0
    fi

    if (( SYNC_WARN_COUNT >= SYNC_WARN_THRESHOLD )); then
      # YES, lets force a sync job. Do not need to remove warning marker here as it is automatically removed when the sync job is run by this script
      echo "Number of warning(s) ($SYNC_WARN_COUNT) has reached/exceeded threshold ($SYNC_WARN_THRESHOLD). Forcing a SYNC job to run. [$(date)]"
      DO_SYNC=1
    else
      # NO, so let's increment the warning count and skip the sync job
      SYNC_WARN_COUNT=$((SYNC_WARN_COUNT + 1))
      printf '%s\n' "$SYNC_WARN_COUNT" > "$SYNC_WARN_FILE"
      echo "$((SYNC_WARN_THRESHOLD - SYNC_WARN_COUNT)) warning(s) till forced sync. NOT proceeding with SYNC job. [$(date)]"
      DO_SYNC=0
    fi
  else
    # NO, so let's skip SYNC
    echo "Forced sync is not enabled. Check $TMP_OUTPUT for details. NOT proceeding with SYNC job. [$(date)]"
    DO_SYNC=0
  fi
}
#######################################
# Run `snapraid touch` when `snapraid status` reports files with a zero
# sub-second timestamp.
# Globals:   SNAPRAID_BIN (read); TIMESTATUS (written)
# Outputs:   progress messages and the touch job output on stdout
#######################################
function chk_zero() {
  echo "###SnapRAID TOUCH [$(date)]"
  echo "Checking for zero sub-second files."
  # Keep only the status line that reports a non-zero file count and
  # reword it for the log.
  TIMESTATUS=$("$SNAPRAID_BIN" status \
    | grep 'You have [1-9][0-9]* files with zero sub-second timestamp\.' \
    | sed 's/^You have/Found/g')
  if [[ -n "$TIMESTATUS" ]]; then
    echo "$TIMESTATUS"
    echo "Running TOUCH job to timestamp. [$(date)]"
    # Foreground call: no `wait` needed afterwards (and a bare `wait` can
    # block on the tee logger in newer bash releases).
    "$SNAPRAID_BIN" touch
    echo "TOUCH finished [$(date)]"
  else
    echo "No zero sub-second timestamp files found."
  fi
}
#######################################
# Stop services that may interfere with the SnapRAID run.
# Globals:   SOPHOS_RUNNING (set to 1 when Sophos on-access was disabled,
#            so restore_services can re-enable it)
# Outputs:   a message on stdout when a service is shut down
#######################################
function stop_services() {
  local sophos_bin="/opt/sophos-av/bin"

  # Disable Sophos on-access. Skip the probe entirely when Sophos is not
  # installed so the log is not polluted with "No such file" errors.
  if [[ -x "${sophos_bin}/savdstatus" ]] \
    && "${sophos_bin}/savdstatus" | grep -v "not running" > /dev/null; then
    echo "Sophos on-access detected, shutting it down..."
    # Foreground call: no `wait` needed afterwards (and a bare `wait` can
    # block on the tee logger in newer bash releases).
    "${sophos_bin}/savdctl" disable
    SOPHOS_RUNNING=1
  fi

  # Systemctl example
  # Be sure to add an associated SERVICE_A_RUNNING=0 INIT VARIABLES above
  #if [ `systemctl is-active service-A.service` == "active" ]; then
  #  echo "SERVICE_A detected, shutting it down..."
  #  systemctl stop service-A.service
  #  SERVICE_A_RUNNING=1
  #fi
}
#######################################
# Restart whatever stop_services shut down. Also installed as the INT/EXIT
# trap handler.
# Globals:   SOPHOS_RUNNING (read, reset to 0), GRACEFUL (read)
# Outputs:   a blank line, plus a message when a service is restored
# Returns:   0 when GRACEFUL is 1 (normal end of main); otherwise restores
#            the file descriptors via clean_desc and exits the script.
#######################################
function restore_services() {
  echo

  # Enable Sophos on-access scanning. The :-0 defaults keep the numeric
  # tests valid if the trap fires before main has initialised the flags.
  if [[ "${SOPHOS_RUNNING:-0}" -eq 1 ]]; then
    echo "Restoring Sohpos on-access to active state..."
    # Foreground call: no `wait` needed afterwards (and a bare `wait` can
    # block on the tee logger in newer bash releases).
    /opt/sophos-av/bin/savdctl enable
    SOPHOS_RUNNING=0
  fi

  # Systemctl example
  #if [ $SERVICE_A_RUNNING -eq 1 ]; then
  #  systemctl start service-A.service
  #  echo "Restoring SERVICE_A to active state..."
  #  SERVICE_A_RUNNING=0
  #fi

  if [[ "${GRACEFUL:-0}" -eq 1 ]]; then
    return 0
  fi

  # Not a graceful call (trap on INT/EXIT): put the descriptors back and
  # terminate.
  clean_desc

  exit
}
#######################################
# Point stdout/stderr back at the descriptors saved in fds 3 and 4.
# Returns:   0
#######################################
function clean_desc() {
  # Cleanup file descriptors
  exec 1>&3 2>&4

  # If interactive shell restore output. Written as an `if` so a
  # non-interactive run does not leave the function with a failure status
  # from the test itself.
  if [[ $- == *i* ]]; then
    exec &> /dev/tty
  fi
  return 0
}
#######################################
# Render the log as HTML and mail it.
# Globals:   TMP_OUTPUT, MAIL_BIN, SUBJECT, EMAIL_ADDRESS (read)
# Requires:  the python `markdown` module and a mailx whose -a option adds
#            a header. NOTE(review): some mailx variants use -a for
#            attachments - confirm against the installed one.
#######################################
function send_mail() {
  # Format for markdown: a hard line break needs at least two trailing
  # spaces, so append two to every line of the log.
  sed_me "s/$/  /" "$TMP_OUTPUT"
  "$MAIL_BIN" -a 'Content-Type: text/html' -s "$SUBJECT" "$EMAIL_ADDRESS" \
    < <(python -m markdown "$TMP_OUTPUT")
}
# Set TRAP: restore stopped services (and the original file descriptors)
# on Ctrl-C and on every exit path.
trap restore_services INT EXIT

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment