Created
April 19, 2015 21:38
-
-
Save jpluimers/2bb7c1df199ed19f0331 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# $Id$

# This script should be run as a cron job on a regular interval. It will
# perform several system checks such as available disk space, free physical
# RAM check, and check logical drives' status/state.

# Full paths of the vendor RAID command line tools. The checks for a tool are
# skipped when its binary is not executable.
MEGACLI="/opt/sbin/MegaCli64"
ARCCONF="/opt/sbin/arcconf"

# Used/total ratios above which the memory and disk checks raise a warning.
MEM_PRCT_THRESH=0.90
DISK_PRCT_THRESH=0.80

# File system label that the USB drive check tries to resolve with findfs.
CHK_FS_LABEL="esos_root"

# Recipient and sender of the e-mail sent when the label cannot be resolved.
EMAIL_TO="root"
EMAIL_FROM="root"

# Directory that holds the temporary configuration archive.
TMP_PATH="/tmp"
# Check MegaRAID logical drives (if any)

# check_megaraid
#   Runs the MegaRAID health checks with the MegaCli binary named by
#   ${MEGACLI}: logical drive state, physical drive firmware state and
#   enclosure status, for every adapter the tool reports.
#   Globals:  MEGACLI (read)
#   Outputs:  progress messages on stdout, one warning per problem on stderr
#   Returns:  0 always; problems are reported, they never abort the script
check_megaraid() {
    if [ ! -x "${MEGACLI}" ]; then
        echo "It appears the '${MEGACLI}' tool is not installed, or at least"
        echo "is not executable. Skipping MegaRAID logical drive checks..."
        return 0
    fi

    echo "Starting MegaRAID health checks..."
    # Get the number of adapters on the system
    adp_count=$("${MEGACLI}" -adpCount -NoLog | grep "Controller Count:" |
        cut -d: -f2 | tr -d ' .\n')
    echo "Number of adapters found: ${adp_count}"
    case "${adp_count}" in
        '' | *[!0-9]*)
            # Without a usable count there is nothing to iterate over
            echo "** Warning! Unable to determine the number of MegaRAID" \
                "adapters!" 1>&2
            return 0
            ;;
    esac

    # Adapter numbers start with 0 for MegaRAID
    adapter=0
    while [ "${adapter}" -lt "${adp_count}" ]; do
        # Get the number of logical drives for the adapter
        # Logical drive numbers start with 0 for MegaRAID
        echo "Checking logical drives..."
        ld_count=$("${MEGACLI}" -AdpAllInfo "-a${adapter}" -NoLog |
            grep "Virtual Drives" | cut -d: -f2 | tr -d ' \n')
        if [ "${ld_count:-0}" -gt 0 ]; then
            echo "Adapter ${adapter} has ${ld_count} logical drive(s)."
            ld_list=$("${MEGACLI}" -LDInfo -Lall "-a${adapter}" -NoLog |
                grep "Virtual Drive:" | sed 's/CacheCade //g' | cut -d" " -f3)
            # Word splitting of the drive number list is intended here
            for logical_drv in ${ld_list}; do
                # Only accept a line whose name is exactly "State"; any other
                # line that merely contains the word would otherwise be
                # glued onto the state text and trigger a false warning.
                ld_state=$("${MEGACLI}" -LDInfo "-L${logical_drv}" \
                    "-a${adapter}" -NoLog |
                    grep "^[[:space:]]*State[[:space:]]*:" |
                    cut -d: -f2 | tr -d ' \n')
                if [ "${ld_state}" != "Optimal" ]; then
                    echo "** Warning! MegaRAID logical drive ${logical_drv} on" \
                        "adapter ${adapter} is not optimal!" 1>&2
                    echo "** Logical drive state: ${ld_state}" 1>&2
                fi
            done
        fi

        # Check the physical drives for the adapter
        echo "Checking physical drives..."
        pd_count=0
        # The tool output is fed through a here-document so the loop runs in
        # the current shell (pd_count survives) and every line is read
        # verbatim, without word splitting or pathname expansion.
        while IFS= read -r pd_line; do
            case "${pd_line}" in
                *"Firmware state:"*) ;;
                *) continue ;;
            esac
            pd_count=$((pd_count + 1))
            drv_state=$(printf '%s\n' "${pd_line}" | cut -d: -f2 |
                sed 's/^ *//')
            case "${drv_state}" in
                "Unconfigured(good), Spun Up" | "Online, Spun Up" | \
                    "Hotspare, Spun Up") ;;
                *)
                    echo "** Warning! It appears a MegaRAID physical drive has" \
                        "failed on adapter ${adapter}!" 1>&2
                    echo "** Physical drive state: ${drv_state}" 1>&2
                    ;;
            esac
        done <<EOF
$("${MEGACLI}" -PDList "-a${adapter}" -NoLog)
EOF
        echo "Checked ${pd_count} physical drive(s)."

        # Check the status of attached enclosures
        echo "Checking enclosures..."
        while IFS= read -r enc_line; do
            case "${enc_line}" in
                *"Slot Status"* | *"Power Supply Status"* | *"Fan Status"* | \
                    *"Temperature Sensor Status"* | *"SIM Module Status"*) ;;
                *) continue ;;
            esac
            line_name=$(printf '%s\n' "${enc_line}" | cut -d: -f1 |
                sed 's/ *$//')
            line_status=$(printf '%s\n' "${enc_line}" | cut -d: -f2 |
                sed 's/^ *//')
            case "${line_status}" in
                "OK" | "Not Installed" | "Unknown" | "Unsupported" | \
                    "Not Available") ;;
                *)
                    echo "** Warning! An enclosure health/status issue" \
                        "has been detected on adapter ${adapter}!" 1>&2
                    echo "** ${line_name} -> ${line_status}" 1>&2
                    ;;
            esac
        done <<EOF
$("${MEGACLI}" -EncStatus "-a${adapter}" -NoLog)
EOF

        adapter=$((adapter + 1))
    done
}

check_megaraid
# Check AACRAID logical drives (if any)

# check_aacraid
#   Runs the AACRAID health checks with the arcconf binary named by
#   ${ARCCONF}: logical drive status and physical drive state, for every
#   adapter the tool reports.
#   Globals:  ARCCONF (read)
#   Outputs:  progress messages on stdout, one warning per problem on stderr
#   Returns:  0 always; problems are reported, they never abort the script
check_aacraid() {
    if [ ! -x "${ARCCONF}" ]; then
        echo "It appears the '${ARCCONF}' tool is not installed, or at least"
        echo "is not executable. Skipping AACRAID logical drive checks..."
        return 0
    fi

    echo "Starting AACRAID health checks..."
    # Get the number of adapters on the system
    adp_count=$("${ARCCONF}" GETVERSION nologs | grep "Controllers found:" |
        cut -d: -f2 | tr -d ' \n')
    echo "Number of adapters found: ${adp_count}"
    case "${adp_count}" in
        '' | *[!0-9]*)
            # Without a usable count there is nothing to iterate over
            echo "** Warning! Unable to determine the number of AACRAID" \
                "adapters!" 1>&2
            return 0
            ;;
    esac

    # Adapter numbers start with 1 for AACRAID
    adapter=1
    while [ "${adapter}" -le "${adp_count}" ]; do
        # Get the number of logical drives for the adapter
        # Logical drive numbers start with 0 for AACRAID
        ld_count=$("${ARCCONF}" GETCONFIG "${adapter}" AD nologs |
            grep "Logical devices/Failed/Degraded" |
            cut -d: -f2 | cut -d/ -f1 | tr -d ' \n')
        if [ "${ld_count:-0}" -gt 0 ]; then
            echo "Adapter ${adapter} has ${ld_count} logical drive(s)."
            ld_list=$("${ARCCONF}" GETCONFIG "${adapter}" LD nologs |
                grep "Logical device number" | cut -d" " -f4)
            # Word splitting of the drive number list is intended here
            for logical_drv in ${ld_list}; do
                ld_state=$("${ARCCONF}" GETCONFIG "${adapter}" LD \
                    "${logical_drv}" nologs |
                    grep "Status of logical device" | cut -d: -f2 |
                    tr -d ' \n')
                if [ "${ld_state}" != "Optimal" ]; then
                    echo "** Warning! AACRAID logical drive ${logical_drv} on" \
                        "adapter ${adapter} is not optimal!" 1>&2
                    echo "** Logical drive state: ${ld_state}" 1>&2
                fi
            done
        fi

        # Check the physical drives for the adapter
        echo "Checking physical drives..."
        pd_count=0
        # The tool output is fed through a here-document so the loop runs in
        # the current shell (pd_count survives) and every line is read
        # verbatim, without word splitting or pathname expansion.
        # NOTE(review): the "State :" match text is kept exactly as it was;
        # confirm it agrees with the column spacing arcconf really prints.
        while IFS= read -r pd_line; do
            case "${pd_line}" in
                *"State :"*) ;;
                *) continue ;;
            esac
            pd_count=$((pd_count + 1))
            drv_state=$(printf '%s\n' "${pd_line}" | cut -d: -f2 |
                sed 's/^ *//')
            case "${drv_state}" in
                "Online" | "Ready" | "Hot Spare") ;;
                *)
                    echo "** Warning! It appears a AACRAID physical drive has" \
                        "failed on adapter ${adapter}!" 1>&2
                    echo "** Physical drive state: ${drv_state}" 1>&2
                    ;;
            esac
        done <<EOF
$("${ARCCONF}" GETCONFIG "${adapter}" PD nologs)
EOF
        echo "Checked ${pd_count} physical drive(s)."

        # TODO: Need to implement enclosure status check.
        adapter=$((adapter + 1))
    done
}

check_aacraid
# Check physical RAM

# check_memory [MEMINFO_FILE]
#   Reads MemTotal and MemAvailable (kB) from MEMINFO_FILE and warns when the
#   used/total ratio is above the memory threshold.
#   Globals:    MEM_PRCT_THRESH (read; 0.90 is used when it is unset)
#   Arguments:  $1 - file in /proc/meminfo format (default: /proc/meminfo)
#   Outputs:    a usage summary on stdout, warnings on stderr
#   Returns:    0 always
check_memory() {
    mem_info_file="${1:-/proc/meminfo}"
    mem_thresh="${MEM_PRCT_THRESH:-0.90}"
    mem_total=""
    mem_avail=""
    if [ -r "${mem_info_file}" ]; then
        mem_total=$(awk '$1 == "MemTotal:" { print $2; exit }' \
            "${mem_info_file}")
        mem_avail=$(awk '$1 == "MemAvailable:" { print $2; exit }' \
            "${mem_info_file}")
    fi

    echo "Physical RAM check..."
    # Refuse to do arithmetic on missing or non-numeric values; a file
    # without a MemAvailable line ends up here.
    mem_values_ok=1
    case "${mem_total}" in '' | 0 | *[!0-9]*) mem_values_ok=0 ;; esac
    case "${mem_avail}" in '' | *[!0-9]*) mem_values_ok=0 ;; esac
    if [ "${mem_values_ok}" -ne 1 ]; then
        echo "** Warning! Unable to read MemTotal/MemAvailable from" \
            "'${mem_info_file}'; skipping the memory check." 1>&2
        return 0
    fi

    mem_used=$((mem_total - mem_avail))
    printf 'Total Memory:\t\t%s kB\nUsed Memory:\t\t%s kB\nAvailable Memory:\t%s kB\n' \
        "${mem_total}" "${mem_used}" "${mem_avail}"
    prct_mem_used=$(awk -v u="${mem_used}" -v t="${mem_total}" \
        'BEGIN { printf("%.2f", u / t) }')
    printf 'Memory used percent:\t%s\n' "${prct_mem_used}"

    # Compare numerically on the unrounded ratio. Rounding to one significant
    # digit and comparing the text (as expr does for non-integers) misses
    # 0.94 against 0.90 and treats "1e-06" as larger than "0.90".
    if awk -v u="${mem_used}" -v t="${mem_total}" -v lim="${mem_thresh}" \
        'BEGIN { exit !((u / t) > (lim + 0)) }'; then
        echo "** Warning! Maximum memory used threshold (${mem_thresh}) has been exceeded..." 1>&2
        echo "Total Physical RAM: ${mem_total} kB" 1>&2
        echo "Available Physical RAM: ${mem_avail} kB" 1>&2
    fi
}

check_memory
# Check disk space (well, tmpfs root FS space)

# check_disk_space
#   Reads the total/used/available megabytes of / from the first "df -m /"
#   output line that mentions tmpfs and warns when the used/total ratio is
#   above the disk threshold.
#   Globals:  DISK_PRCT_THRESH (read; 0.80 is used when it is unset)
#   Outputs:  a usage summary on stdout, warnings on stderr
#   Returns:  0 always
check_disk_space() {
    disk_thresh="${DISK_PRCT_THRESH:-0.80}"
    # One df call supplies all three figures, so they describe the same
    # moment in time.
    read -r disk_total disk_used disk_avail <<EOF
$(df -m / | awk '/tmpfs/ { print $2, $3, $4; exit }')
EOF

    echo "Disk (/ -> root tmpfs) space check..."
    # No tmpfs line (or unexpected columns) leaves nothing to divide by.
    disk_values_ok=1
    case "${disk_total}" in '' | 0 | *[!0-9]*) disk_values_ok=0 ;; esac
    case "${disk_used}" in '' | *[!0-9]*) disk_values_ok=0 ;; esac
    if [ "${disk_values_ok}" -ne 1 ]; then
        echo "** Warning! Unable to read the tmpfs usage of / from df;" \
            "skipping the disk space check." 1>&2
        return 0
    fi

    printf 'Total Disk Space:\t%s MB\nUsed Disk Space:\t%s MB\nAvail. Disk Space:\t%s MB\n' \
        "${disk_total}" "${disk_used}" "${disk_avail}"
    prct_disk_used=$(awk -v u="${disk_used}" -v t="${disk_total}" \
        'BEGIN { printf("%.2f", u / t) }')
    printf 'Disk used percent:\t%s\n' "${prct_disk_used}"

    # Compare numerically on the unrounded ratio. Rounding to one significant
    # digit and comparing the text (as expr does for non-integers) misses
    # 0.84 against 0.80.
    if awk -v u="${disk_used}" -v t="${disk_total}" -v lim="${disk_thresh}" \
        'BEGIN { exit !((u / t) > (lim + 0)) }'; then
        echo "** Warning! Maximum disk space used threshold (${disk_thresh}) has been exceeded..." 1>&2
        echo "Total Disk Space: ${disk_total} MB" 1>&2
        echo "Avail. Disk Space: ${disk_avail} MB" 1>&2
    fi
}

check_disk_space
# Check if the USB drive is available/working via one of the FS labels

# check_usb_drive
#   When findfs cannot resolve the file system label ${CHK_FS_LABEL}, e-mails
#   a failure notice with a uuencode'd archive of /etc (minus rc.d, ssh and
#   shadow* entries) through sendmail.
#   Globals:  CHK_FS_LABEL, EMAIL_TO, EMAIL_FROM, TMP_PATH (read)
#   Outputs:  a warning on stderr if no temporary directory can be created
#   Returns:  0 if the label resolves, otherwise the exit status of sendmail
check_usb_drive() {
    # Nothing to do while the label still resolves
    if findfs "LABEL=${CHK_FS_LABEL}" > /dev/null 2>&1; then
        return 0
    fi

    usb_host=$(hostname)
    # Create a archive of the configuration files. It lives in a private
    # mktemp directory: this writes files from /etc, so the path must not be
    # a guessable name in a shared directory.
    arch_pkg_file="${usb_host}-esos_conf-$(date +%s).tgz"
    arch_attachment=""
    if arch_pkg_dir=$(mktemp -d "${TMP_PATH:-/tmp}/esos_conf.XXXXXX"); then
        arch_pkg_path="${arch_pkg_dir}/${arch_pkg_file}"
        tar cpfz "${arch_pkg_path}" --exclude='rc.d' --exclude='ssh' \
            --exclude='shadow*' /etc > /dev/null 2>&1
        # Only encode an archive that was really written
        if [ -s "${arch_pkg_path}" ]; then
            arch_attachment=$(uuencode "${arch_pkg_path}" "${arch_pkg_file}")
        fi
        rm -rf -- "${arch_pkg_dir:?}"
    else
        echo "** Warning! Unable to create a temporary directory in" \
            "'${TMP_PATH:-/tmp}'; sending the alert without an archive." 1>&2
    fi

    # Send an email with the archive file attachment (uuencode'd). The
    # here-document must stay unindented, and the empty line after Subject
    # is what separates the mail headers from the message body.
    sendmail -t << _EOF_
To: ${EMAIL_TO}
From: ${EMAIL_FROM}
Subject: ESOS USB Flash Drive Failure - ${usb_host} ($(date))

A possible USB flash drive failure has been detected on Enterprise Storage OS host "${usb_host}".
The findfs utility exited non-zero when attempting to resolve file system label "${CHK_FS_LABEL}". This may be due to a failed ESOS USB flash drive, or because the device was removed, or some other reason.
We're attaching a tar ball archive of the ESOS configuration files for this host just incase.
${arch_attachment}
_EOF_
}

check_usb_drive
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment