Last active
January 19, 2018 19:37
-
-
Save erincerys/ed2099df4ec52ea886fc6a3d1a5ea989 to your computer and use it in GitHub Desktop.
Script to be remotely invoked by nagios that returns a service's status and can trigger an alarm when down or failed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
## Checks a systemd controlled service for status and start failures.
##
## Meant to be invoked by Nagios: prints "<STATUS> - <message>" on stdout and
## exits with the matching Nagios plugin return code
## (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).
##
## A counter of consecutive failing checks is kept in COUNT_FILE. While the
## counter is below MAX_FAILED_CHECKS a failing check is reported as WARNING
## and the counter is incremented; once the threshold is reached the check is
## reported as CRITICAL. A healthy check removes the counter file.

# -e is deliberately not enabled: `systemctl is-failed` exits non-zero for
# every state other than "failed", which is the normal case for this script.
set -u

SVC_NAME='servicename'
# NOTE(review): a fixed file name under /tmp can be pre-created by any local
# user; consider a directory owned by the user running the check.
COUNT_FILE='/tmp/checkservice.counter'

# Number of failing checks reported as WARNING before escalating to CRITICAL.
readonly MAX_FAILED_CHECKS=10

# Nagios plugin return codes.
readonly STATE_OK=0 STATE_WARNING=1 STATE_CRITICAL=2 STATE_UNKNOWN=3

#######################################
# Load the counter left behind by previous failing checks.
# Globals:   COUNT_FILE (read)
# Outputs:   the stored count on stdout; 0 when the file is missing,
#            unreadable or does not contain a plain non-negative integer
#######################################
read_failed_count() {
  local raw=''

  if [[ -f "$COUNT_FILE" && -r "$COUNT_FILE" ]]; then
    raw=$(<"$COUNT_FILE")
  fi

  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a value such as "08" is not parsed as invalid octal.
    printf '%d\n' "$((10#$raw))"
  else
    printf '0\n'
  fi
}

#######################################
# Query the service state, update the failure counter and report to Nagios.
# Globals:   SVC_NAME, COUNT_FILE, MAX_FAILED_CHECKS, STATE_* (read)
# Outputs:   one "<STATUS> - <message>" line on stdout
# Returns:   never returns; exits with the Nagios return code
#######################################
main() {
  local svc_status message failed_count alarm_status exit_code

  # Only the printed state is inspected: the exit status of `is-failed` merely
  # distinguishes "failed" (0) from everything else (non-zero).
  svc_status=$(systemctl is-failed "$SVC_NAME")

  case "$svc_status" in
    active)
      # Healthy again: forget about earlier failures.
      rm -f -- "$COUNT_FILE"
      printf '%s\n' 'OK - Service is running'
      exit "$STATE_OK"
      ;;
    failed | activating)
      # Service failed to start
      message='Service restarting or has failed to restart'
      ;;
    unknown | inactive)
      # Service intentionally stopped or crashed
      message='Service crashed or intentionally stopped'
      ;;
    *)
      # Any other state (or no output at all, e.g. systemctl could not be
      # run) cannot be classified; report it instead of guessing, and leave
      # the failure counter untouched.
      printf 'UNKNOWN - Unhandled service state: %s\n' "${svc_status:-<none>}"
      exit "$STATE_UNKNOWN"
      ;;
  esac

  # Get the number of failed starts
  # TODO: switch to `systemctl show "$SVC_NAME" -p NRestarts` once it can be
  # relied upon on the target hosts:
  # https://github.com/systemd/systemd/pull/6495
  failed_count=$(read_failed_count)

  if ((failed_count >= MAX_FAILED_CHECKS)); then
    alarm_status='CRITICAL'
    exit_code=$STATE_CRITICAL
  else
    alarm_status='WARNING'
    exit_code=$STATE_WARNING
    printf '%d\n' "$((failed_count + 1))" > "$COUNT_FILE"
  fi

  # Return problem info and exit code
  printf '%s - %s\n' "$alarm_status" "$message"
  exit "$exit_code"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment