Created
April 28, 2014 00:00
-
-
Save jab416171/11358501 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorVolume.sh
# Note: this is called by cron
#
# NOTE(review): the header names this file monitorVolume.sh, but the body only
# tracks data-volume free space (FREESPACE_STATUS_FILE) -- confirm the name.
#
# Raises the diskNearCapacity alert once usage exceeds MAX_USAGE_THRESH and
# clears the status flag again once usage drops to MIN_USAGE_THRESH or below.
# The gap between the two thresholds prevents alert flapping.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /etc/system.conf
. /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null

MAX_USAGE_THRESH=95
MIN_USAGE_THRESH=93

# check DataVolume percent used
percentUsed=$(getDataVolumePercentUsed.sh)
echo "% used=${percentUsed} MAX=${MAX_USAGE_THRESH}"

# Only compare numerically when the helper really returned an integer;
# "[ '' -gt 95 ]" is an error, not a false result.
case "${percentUsed}" in
  '' | *[!0-9]*) usageKnown=false ;;
  *) usageKnown=true ;;
esac

# /tmp/tst_freespace forces the near-capacity path (test hook).
overThreshold=false
if [ -f /tmp/tst_freespace ]; then
  overThreshold=true
elif [ "${usageKnown}" = true ] && [ "${percentUsed}" -gt "${MAX_USAGE_THRESH}" ]; then
  overThreshold=true
fi

if [ "${overThreshold}" = true ]; then
  if [ ! -f "${FREESPACE_STATUS_FILE}" ]; then
    sendAlert.sh "${diskNearCapacity}"
  fi
  # Re-checked on purpose (as in the original) rather than merged with the
  # test above, in case sendAlert.sh itself touches the status file.
  if [ ! -f "${FREESPACE_STATUS_FILE}" ]; then
    touch "${FREESPACE_STATUS_FILE}"
    incUpdateCount.pm system_state
  fi
elif [ "${usageKnown}" != true ]; then
  echo "$0: non-numeric usage \"${percentUsed}\"; skipping threshold check" >&2
elif [ "${percentUsed}" -le "${MIN_USAGE_THRESH}" ]; then
  if [ -f "${FREESPACE_STATUS_FILE}" ]; then
    rm -f "${FREESPACE_STATUS_FILE}"
    incUpdateCount.pm system_state
  fi
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# (c) 2013 Western Digital Technologies, Inc. All rights reserved.
#
# monitorio - Monitor disk activity, and put system into standby. Also, monitor to trigger file tally process
##
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

. /lib/lsb/init-functions
source /etc/priority.conf
source /etc/system.conf
source /usr/local/sbin/drive_helper.sh
source /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

# File holding the number of minutes since the last observed disk access.
MIN_SINCE_DISK_ACCESS=/tmp/minutes_since_disk_access

# Tally daemon: pid file, binary and the pipe it is fed through.
TALLY_PIDFILE=/var/run/tally.pid
TALLY_DAEMON=/usr/local/bin/tally
TALLY_PIPE=/var/local/nas_file_tally/tallyd.pipe

# Flag file touched when the drives leave standby.
MEDIACRAWLER_REWALK=/tmp/mediacrawler_rewalk

# Only give monitorio 20% of the CPU Max
# NOTE(review): cpu.shares is a relative weight rather than a percentage --
# confirm that 20 gives the intended limit.
MONITORIO_CPU_SHARE=20
CGROUP_MONITORIO=/sys/fs/cgroup/monitorio
mkdir -p "$CGROUP_MONITORIO"
echo "$MONITORIO_CPU_SHARE" > "$CGROUP_MONITORIO/cpu.shares"
# Put this shell (and therefore its children) into the cgroup.
echo $$ > "$CGROUP_MONITORIO/tasks"

# Last "used" figure from df, persisted between trigger checks.
total_df_file=$WD_NAS_VAR_DIR/total_df

# trigger tally (or share size) when df result changes by TALLY_TRIGGER_THRESH_KB
TALLY_TRIGGER_THRESH_KB=1000000
#######################################
# Start the tally daemon (if not already running) and stream a recursive
# listing of /shares into its pipe.
# Each entry is written as "#4:<size>:<dir>/<name>~~~~"; the stream ends with
# "#0:0:/tmp/TALLYEND.DONE~~~~".
# Globals: TALLY_PIPE, TALLY_PIDFILE, TALLY_DAEMON, monitorio_nice (read)
#######################################
file_tally() {
  if [ ! -p "$TALLY_PIPE" ]; then
    mkfifo "$TALLY_PIPE"
  fi
  start-stop-daemon --start --quiet --oknodo --nicelevel "$monitorio_nice" --pidfile "$TALLY_PIDFILE" --make-pidfile --background --exec "$TALLY_DAEMON" --
  ls -s1NRA --block-size=1 /shares | awk '
    {
      if ($1 ~ /^[0-9]+$/) {
        # Entry line "<size> <name>": drop the size column to get the name.
        # Searching for $2 with index() would match inside the size itself
        # (e.g. a file called "40" with size 4096).
        name = $0;
        sub(/^ *[0-9]+ /, "", name);
        # printf("#4:%s:%s/%s\0\0\0\0",$1,current_dir,name);
        printf("#4:%s:%s/%s~~~~", $1, current_dir, name);
      }
      else {
        if ($1 != "total") {
          # Directory header "<path>:" -- strip the trailing colon.
          current_dir = (substr($0, 1, length($0) - 1));
        }
      }
    }
    END {
      printf("#0:0:/tmp/TALLYEND.DONE~~~~");
    }
  ' > "$TALLY_PIPE"
  # ' > /var/local/nas_file_tally/tallyd.txt
  # cat /var/local/nas_file_tally/tallyd.txt > $TALLY_PIPE
}
#######################################
# Block until the /tmp/ready flag file exists, polling every 5 seconds and
# logging a message on each poll.
#######################################
wait_system_ready() {
  until [ -f "/tmp/ready" ]; do
    logger -s "$0: waiting for system to become ready.."
    sleep 5
  done
}
# Scratch files: combined list, plus the internal and external parts it is
# concatenated from.
tmp_share_size=/tmp/share_size
tmp_internal_share_size=/tmp/internal_share_size
tmp_external_share_size=/tmp/external_share_size

#######################################
# Recompute the size of every non-hidden directory directly under /shares
# (via getShareSize.sh) and rebuild the combined size list.
# Globals: tmp_internal_share_size, tmp_external_share_size, tmp_share_size
#######################################
calculate_share_size() {
  find /shares -maxdepth 1 -mindepth 1 -type d -not -name ".*" -print0 \
    | xargs -0 -I {} getShareSize.sh {} > "${tmp_internal_share_size}"
  # The external list may not have been produced yet (first call at startup);
  # create it empty so the concatenation below does not fail.
  [ -e "${tmp_external_share_size}" ] || : > "${tmp_external_share_size}"
  cat "${tmp_internal_share_size}" "${tmp_external_share_size}" > "${tmp_share_size}"
}
#######################################
# Recompute the size of every symlink directly under /shares (via
# getShareSize.sh) and rebuild the combined size list.
# Globals: tmp_internal_share_size, tmp_external_share_size, tmp_share_size
#######################################
calculate_external_share_size() {
  find /shares -maxdepth 1 -mindepth 1 -type l -print0 \
    | xargs -0 -I {} getShareSize.sh {} > "${tmp_external_share_size}"
  # The internal list is never written when only this function is used;
  # create it empty so the concatenation below does not fail.
  [ -e "${tmp_internal_share_size}" ] || : > "${tmp_internal_share_size}"
  cat "${tmp_internal_share_size}" "${tmp_external_share_size}" > "${tmp_share_size}"
}
#######################################
# Decide whether data-volume usage changed enough to re-run the tally.
# Compares the "used" column of the first df line containing /DataVolume with
# the value saved in total_df_file. Triggers when no value has been saved yet
# or when the absolute difference is at least TALLY_TRIGGER_THRESH_KB. On a
# trigger the saved value is replaced by the current one.
# Globals: total_df_file, TALLY_TRIGGER_THRESH_KB (read)
# Outputs: "trigger" or an empty line on stdout
#######################################
checkDataTrigger() {
  local used_kb saved_kb result

  # One df snapshot, first matching line only: several matching lines used to
  # yield "triggertrigger", which never compared equal to "trigger".
  used_kb=$(df | awk '/\/DataVolume/ { print $3; exit }')

  result="trigger"
  if [ -f "${total_df_file}" ]; then
    saved_kb=""
    IFS= read -r saved_kb < "${total_df_file}"
    result=""
    if [ -n "${used_kb}" ]; then
      result=$(awk -v cur="${used_kb}" -v prev="${saved_kb}" -v thresh="${TALLY_TRIGGER_THRESH_KB}" \
        'BEGIN { x = cur - prev; abs_x = (x >= 0) ? x : -x; if (abs_x >= thresh) printf("trigger") }')
    fi
  fi

  if [ "$result" == "trigger" ]; then
    printf '%s\n' "${used_kb}" > "${total_df_file}"
  fi
  printf '%s\n' "$result"
}
# Make sure the directory that will hold the share-size cache exists.
mkdir -p "$(dirname "${SHARE_SIZE_CACHE}")"

# Integer state used by the main loop.
declare -i sleepcount
declare -i rootdisk_thresh
declare -i enterStandbyTime=0

# Start from a clean state: not in standby, no pending media-crawler rewalk.
rm -f /tmp/standby
rm -f "${MEDIACRAWLER_REWALK}"

source /etc/standby.conf
#######################################
# Reset the idle-minute counter and reload the standby settings.
# Globals written: sleepcount, standby_time, standby_enable, rootdisk_thresh
#######################################
resetSleepCount() {
  sleepcount=0
  # if in emergency run level, set standby threshold to 1 minute, since drive should go into standby as early as possible, otherwise, read config file
  if [ "$(getRunLevel.pl)" == "emergency" ]; then
    standby_time=1
    rootdisk_thresh=1
    standby_enable="enabled"
  else
    source /etc/standby.conf
    # The main loop checks root-disk writes during the last minute before
    # the standby threshold.
    rootdisk_thresh=$(( standby_time - 1 ))
  fi
}
# Root device as given by "root=" on the kernel command line; only the device
# base names are needed to look the disks up in /proc/diskstats.
currentRootDevice=$(awk -F= 'BEGIN{RS=" "}{ if ($1=="root") print $2 }' /proc/cmdline)
rootDisk=$(basename "${currentRootDevice}")
dataVolumeDisk=$(basename "${dataVolumeDevice}")
drivelist=( $(internalDrives) )

echo "0" > "${MIN_SINCE_DISK_ACCESS}"

# wait for system to become ready
wait_system_ready

# run file tally at startup (in the background)
if [ ! -f "$TALLY_DAEMON" ]; then
  logger "Tally daemon not installed, exiting tally function"
  ## if tally not present, then call calculate_share_size
  calculate_share_size
  calculate_external_share_size
  # -f: the cache link may not exist yet on a first run.
  rm -f "${SHARE_SIZE_CACHE}"
  ln -s "${tmp_share_size}" "${SHARE_SIZE_CACHE}"
else
  file_tally &
fi

# In debug mode, enable kernel block-dump logging and clear the ring buffer.
if [ "$1" == "debug" ]; then
  echo "1" > /proc/sys/vm/block_dump
  dmesg -c > /dev/null
fi
# Main loop.
# Outer loop: while every drive reports standby, poll every 5 seconds. As soon
# as one drive is awake, run the wake-up handling and enter the inner loop.
# Inner loop: once a minute, sample /proc/diskstats, refresh tally/share sizes
# when needed, count idle minutes and spin the drives down after standby_time
# idle minutes.
while :; do
  # Treat an empty drive list as "awake" instead of testing an unset value.
  standby_test=1
  for i in "${drivelist[@]}"; do
    hdparm -C "$i" | grep -q "standby"
    standby_test=$?
    # One drive not in standby is enough to call the system awake.
    [ "$standby_test" -eq 1 ] && break
  done

  if [ "$standby_test" -eq 0 ]; then
    sleep 5
    continue
  fi

  if [ -f /tmp/standby ]; then
    # Drives woke up since we put them into standby.
    standby_since=$(stat --format %z /tmp/standby)
    rm -f /tmp/standby
    # Cancel blue color and turn on green if applicable
    ledCtrl.sh LED_EV_DISK_STBY LED_STAT_OK
    ### This will allow individual components to register for wakupevents
    run-parts /etc/nas/wakeup.d
    ###
    touch "${MEDIACRAWLER_REWALK}"
    currentTime=$(date +%s)
    timeInStandby=$(( currentTime - enterStandbyTime ))
    echo "exit standby after $timeInStandby (since $standby_since)"
    logger "exit standby after $timeInStandby (since $standby_since)"
    if [ "$1" == "debug" ]; then
      dmesg -c
    fi
  fi

  resetSleepCount
  echo "$sleepcount" > "${MIN_SINCE_DISK_ACCESS}"
  trigger_tally=0

  # Baseline counters: field 10 (used as the write counter, iow_*) for both
  # disks and field 6 (used as the read counter, ior_*) for the data volume.
  iow_root=$(awk -v disk="${rootDisk}" '{if ($3==disk) print $10}' /proc/diskstats)
  ior_datavol=$(awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $6}' /proc/diskstats)
  iow_datavol=$(awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $10}' /proc/diskstats)
  if [ "$1" == "debug" ]; then
    echo "Init ior_datavol=$ior_datavol ior_datavol2=$ior_datavol2"
    echo " iow_datavol=$iow_datavol iow_datavol2=$iow_datavol2"
    echo " iow_root=$iow_root iow_root2=$iow_root2"
    dmesg -c
  fi

  while :; do
    # Wait for 60 seconds
    sleep 60
    iow_root2=$(awk -v disk="${rootDisk}" '{if ($3==disk) print $10}' /proc/diskstats)
    ior_datavol2=$(awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $6}' /proc/diskstats)
    iow_datavol2=$(awk -v disk="${dataVolumeDisk}" '{if ($3==disk) print $10}' /proc/diskstats)

    # check for file tally sync
    if [ "$iow_datavol" -ne "$iow_datavol2" ] && [ "$(checkDataTrigger)" == "trigger" ]; then
      incUpdateCount.pm data_volume_write
      monitorFreeSpace.sh
      if [ -f "$TALLY_DAEMON" ]; then
        # also run tally if installed
        pidofproc -p "$TALLY_PIDFILE" "$TALLY_DAEMON" >/dev/null
        if [ $? -ne 0 ]; then
          file_tally
        fi
        createBackupTally.sh
      else
        ## if tally not present, then call calculate_share_size
        calculate_share_size
      fi
    fi

    # calculate size of external shares. Note that this must be done outside of "checkDataTrigger" so that it is done more often.
    calculate_external_share_size

    # use data volume writes until near sleep threshold, then check all disk writes
    if [ $((sleepcount)) -eq $((rootdisk_thresh)) ] && [ "$iow_root" -eq "$iow_root2" ]; then
      sleepcount=$((sleepcount+1))
    elif [ $((sleepcount)) -lt $((rootdisk_thresh)) ] && [ "$ior_datavol" -eq "$ior_datavol2" ] && [ "$iow_datavol" -eq "$iow_datavol2" ]; then
      sleepcount=$((sleepcount+1))
    else
      resetSleepCount
    fi
    echo "$sleepcount" > "${MIN_SINCE_DISK_ACCESS}"

    if [ "$1" == "debug" ]; then
      [ "$sleepcount" != "0" ] && echo "sleepcount: $sleepcount"
      [ "$sleepcount" == "0" ] && echo "Disk activity:"
      echo "... ior_datavol=$ior_datavol ior_datavol2=$ior_datavol2"
      echo "... iow_datavol=$iow_datavol iow_datavol2=$iow_datavol2"
      echo "... iow_root=$iow_root iow_root2=$iow_root2"
      # dmesg -c
    fi

    # The new sample becomes the baseline for the next minute.
    ior_datavol=$ior_datavol2
    iow_datavol=$iow_datavol2
    iow_root=$iow_root2

    # Never spin down while a SMART self-test is running.
    smartTestStatus=$(getSmartTestStatus.sh | awk '{print $1}')
    if [ "$standby_enable" == "enabled" ] && [ "$sleepcount" -eq "$standby_time" ] && [ "$smartTestStatus" != "inprogress" ]; then
      touch /tmp/standby
      enterStandbyTime=$(date +%s)
      echo "Enter standby"
      if [ "$1" == "debug" ]; then
        echo "$(date): Enter standby "
        dmesg -c > /dev/null
      fi
      for i in "${drivelist[@]}"; do
        hdparm -y "$i" >/dev/null
      done
      # turn on solid blue if applicable
      ledCtrl.sh LED_EV_DISK_STBY LED_STAT_IN_PROG
      sleep 5
      # Back to the outer loop, which now polls for wake-up.
      break
    fi
  done
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorSmartStatus.sh
# Note: this is called by cron
#
# Checks the SMART health of every internal drive. On a failure (or when the
# /tmp/tst_smart test hook exists) it raises the driveSmartFail alert once and
# keeps /tmp/smart_fail as a marker; the marker is removed when all drives pass.
#
# Runs under bash because the drive list is held in an array.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

. /etc/system.conf
. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /usr/local/sbin/drive_helper.sh
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

# exit if in standby
if [ -f /tmp/standby ]; then
  exit 0
fi

# exit if system with no internal drives
if [ "${DVC_DRIVE_COUNT}" = "0" ]; then
  exit 0
fi

atLeastOneDriveFailed=FALSE
driveList=( $(internalDrives) )
for drive in "${driveList[@]}"; do
  # Anything other than an explicit PASSED verdict counts as a failure.
  if ! smartctl -d ata -H "${drive}" | grep -q PASSED; then
    atLeastOneDriveFailed=TRUE
  fi
done

if [ "$atLeastOneDriveFailed" = "TRUE" ] || [ -f /tmp/tst_smart ]; then
  # Alert only on the transition into the failed state.
  if [ ! -f /tmp/smart_fail ]; then
    sendAlert.sh "${driveSmartFail}"
    ledCtrl.sh LED_EV_DISK_SMART LED_STAT_ERR
    incUpdateCount.pm system_state
  fi
  touch /tmp/smart_fail
else
  rm -f /tmp/smart_fail
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# (c) 2012 Western Digital Technologies, Inc. All rights reserved.
#
# monitorTemperature.sh
# Note: This is called by init-script monitorTemperature
#
# This script is responsible to monitor temperature of internal drives
# and take actions if temperature is not normal
#
# It takes following actions depending on temperature of internal drives
# if temperature of any drive > TF
# - change led to RED
# - send shutdown alert
# - change run-level to emergency
# - exit
#
# if temperature of any drive between T2 & TF
# - send shutdown-warning alert
# - start a shutdown-warning timer of 1 HR
# - if timer expires change run-level to emergency
# - exit
#
# if temperature of any drive between T1 & T2
# - send high-temperature warning alert
# - exit
#
# To restart all services & get back to normal
# if temperature of all drives <= T2 - Hysterisis
# - send normal temperature alert
# - change led to GREEN
# - change run-level to application
# - exit
#
# Runs under bash: the script relies on arrays, [[ ]], "local" and
# ${!name} indirect expansion.
#

## --- Includes
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

source /usr/local/sbin/share-param.sh
source /etc/system.conf
source /etc/nas/alert-param.sh # ( for alerts )
source /usr/local/sbin/drive_helper.sh # ( for internalDrives() )
source /usr/local/sbin/wdStatus.sh # ( for $WDST_XXX status codes )
source /etc/wdcomp.d/wd-nas/temperature-monitor.conf
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

## --- Constants

## Acronyms
## The STATE_* values are used as array indices and as array-name suffixes
## below (presumably defined by the sourced config files -- confirm).
NM=${STATE_NORMAL}
WR=${STATE_WARNING}
SW=${STATE_SHUTDOWN_WARNING}
SI=${STATE_SHUTDOWN_IMMEDIATE}
UK=${STATE_UNKNOWN}

## Internal Constants
## NB: TURN OFF BEFORE CHECKING-IN
DEBUG=0 ## for debugging

## For Testing
## - set TEST=1
## - set DEBUG=1
## - disable the infinite "for MONITOR_TIMER" loop
## - enter different temperatures on input
## - to test with infinite "for MONITOR_TIMER" loop set different values for TEMP_TX
TEST=0

## Logger facility
FAC=local2

## Table of allowed actions based on last & curr state
## One array per last state (ACTION<last>), indexed by current state; each
## element is the name of the handler function to run.
## last | curr -- NM WR SW SI UK
## |
eval ACTION${NM}="( act_noop act_warning act_start_timer act_emergency act_noop )"
eval ACTION${WR}="( act_normal act_noop act_start_timer act_emergency act_noop )"
eval ACTION${SW}="( act_normal act_hysterisis act_check_timer act_emergency act_check_timer )"
eval ACTION${SI}="( act_restart act_hysterisis act_cooldown act_cooldown act_cooldown )"
eval ACTION${UK}="( act_UK_2_NM act_warning act_start_timer act_emergency act_noop )" ## should never be called as UK state is never saved

## --- Global Variables
drive_list=
last_state=
curr_state=
curr_temp=
## --- Functions | |
## Get the drive temperature
##
## Reads the raw value (column 10) of the SMART attribute
## "Temperature_Celsius" and also stores it in ${SMART_STATE}.
##
## Input:
## drive device (e.g. sda, sdb)
##
## Output:
## on success - drive temperature on stdout, returns ${WDST_OK}
## on failure - ""(empty), returns ${WDST_NOTFOUND}
##
## E.g. getDriveTemperature "/dev/sda"
getDriveTemperature()
{
  local drive=${1}
  local temp

  ## get the drive temperature using smart
  temp=$(smartctl -d ata -A "${drive}" \
    | awk '{if ($2 == "Temperature_Celsius") print $10}')
  ## the raw reading is recorded even when it turns out not to be a number
  echo "${temp}" > "${SMART_STATE}"

  ## return not found if number is not returned
  if ! [[ "${temp}" =~ ^[0-9]+$ ]]; then
    logger -p "${FAC}.err" "$0: Non-numeric drive temperature \"${temp}\" obtained"
    return ${WDST_NOTFOUND}
  fi

  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Drive ${drive} temperature is ${temp}"
  fi

  ## output the temperature
  echo "${temp}"
  return ${WDST_OK}
}
## Get the current temperature state
##
## Folds the state of every drive in ${drive_list} into one overall state
## using the TRANSITION table, and records the temperature belonging to it.
##
## Input:
## drive list (e.g. /dev/sda /dev/sdb), taken from global ${drive_list}
##
## Output:
## global curr_state - temperature state (e.g. NM, WR, SW, SI, UK)
## global curr_temp - temperature associated with that state
## returns ${WDST_OK}
##
## E.g. determineCurrentState "/dev/sda /dev/sdb /dev/sdc"
determineCurrentState()
{
  local drive
  local temp
  local rc
  local state
  local drive_temp
  local drive_state
  local prev_state
  local transition

  ## allowed temperature states transitions across all drives
  ## NB: "TRANSITION" is treated as a 2-D array
  ## prev | next --- NM WR SW SI UK
  ## |
  eval TRANSITION${NM}="( ${NM} ${WR} ${SW} ${SI} ${UK} )"
  eval TRANSITION${WR}="( ${WR} ${WR} ${SW} ${SI} ${WR} )"
  eval TRANSITION${SW}="( ${SW} ${SW} ${SW} ${SI} ${SW} )"
  eval TRANSITION${SI}="( ${SI} ${SI} ${SI} ${SI} ${SI} )"
  eval TRANSITION${UK}="( ${UK} ${WR} ${SW} ${SI} ${UK} )"

  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Getting current temperature state"
  fi

  ## init
  drive_state=${NM}
  drive_temp=0

  ## loop through the drive list & finalize temperature state using
  ## TRANSITION table
  for drive in "${drive_list[@]}"; do
    ## save prev state & temp
    prev_state=${drive_state}

    ## get the current drive temperature; capture the status right away,
    ## before any other command overwrites $?
    temp=$(getDriveTemperature "${drive}")
    rc=$?

    ## TEST ONLY
    if [ "${TEST}" -ne 0 ]; then
      echo -n "Enter temperature: "
      read -r temp
      rc=${WDST_OK}
      logger -p "${FAC}.debug" "$0: INPUT temperature is ${temp}"
    fi

    ## reset temperature to 0 if not defined (failed read or not a number)
    if [ "${rc}" -ne "${WDST_OK}" ] || ! [[ "${temp}" =~ ^[0-9]+$ ]]; then
      temp=0
    fi

    ## NB: ${temp} is integer value
    ## determine the temperature state of this drive
    if [ "${temp}" -eq 0 ]; then
      state=${UK}
    elif [ "${temp}" -le "${TEMP_T1}" ]; then
      state=${NM}
    elif [ "${temp}" -le "${TEMP_T2}" ]; then
      state=${WR}
    elif [ "${temp}" -le "${TEMP_TF}" ]; then
      state=${SW}
    else
      state=${SI}
    fi

    ## get the actual drive state using the TRANSITION table
    transition=TRANSITION${prev_state}[${state}]
    drive_state=${!transition}

    ## update drive temperature if state changes or temperature increases
    if [ "${drive_temp}" -eq 0 ] || [ "${drive_state}" -ne "${prev_state}" ] || [ "${drive_temp}" -lt "${temp}" ]; then
      drive_temp=${temp}
    fi

    ## optimization: break the loop if current state is SI (shutdown immediate)
    if [ "${drive_state}" -eq "${SI}" ]; then break; fi
  done

  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Current Temperature - ${drive_temp}, Current State - ${drive_state}"
  fi

  ## pass to global variables
  curr_state=${drive_state}
  curr_temp=${drive_temp}
  return ${WDST_OK}
}
## --- Action Handlers | |
## Action: over max-threshold -> record SI, alert, and switch the system to
## the emergency run level.
## Returns ${WDST_OK}.
act_emergency()
{
  ## NB: curr_state is always ${SI} in this action
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Shutdown, Temperature - ${curr_temp}"
  fi

  ## update the temp state file
  echo "${SI}" > "${TEMP_STATE}"
  ## set over temperature state
  touch "${OVER_TEMP_FLAG}"
  ## change led color to red
  ledCtrl.sh LED_EV_THERMO LED_STAT_ERR
  ## send over-temperature with immediate shutdown alert
  sendAlert.sh "${thermalShutdownImmediate}"
  ## stop the timer
  echo 0 > "${TEMP_SHUTDOWN_TIMER}"
  ## notify system for thermal state-change
  incUpdateCount.pm "${THERMAL_STATE_NFY_ID}"
  ## log emergency
  logger -p "${FAC}.emerg" "$0: Current temperature(${curr_temp}) is over max-threshold, stopping all services"
  ## change run-level to emergency
  changeRunLevel.pl --level=emergency
  return ${WDST_OK}
}
## Action: temperature back to normal after an immediate shutdown -> clear the
## over-temperature flag, record NM and switch back to the app run level.
## Returns ${WDST_OK}.
act_restart()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Restart, Temperature - ${curr_temp}"
  fi

  ## reset over-temperature state
  rm -f "${OVER_TEMP_FLAG}"
  ## change led color to green only if system health is good
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  ## send normal temperature alert
  sendAlert.sh "${temperatureNormal}"
  ## update the temp state file
  echo "${NM}" > "${TEMP_STATE}"
  ## notify system for thermal state-change
  incUpdateCount.pm "${THERMAL_STATE_NFY_ID}"
  ## log notice
  logger -p "${FAC}.notice" "$0: Temperature of all drives(${curr_temp}) is now normal, restarting all services"
  ## change run-level to application mode
  changeRunLevel.pl --level=app
  return ${WDST_OK}
}
## Action: still hot while already shut down -> only log, change nothing.
## Returns ${WDST_OK}.
act_cooldown()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Cooldown, Temperature - ${curr_temp}"
  fi

  ## log notice
  logger -p "${FAC}.notice" "$0: Current temperature(${curr_temp}) is still hot, maintaining shutdown"
  return ${WDST_OK}
}
## Action: nothing to do for this state transition (logs only in debug mode).
## Returns ${WDST_OK}.
act_noop()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action noop, Temperature - ${curr_temp}"
  fi

  ## no action taken
  return ${WDST_OK}
}
## Action: entering shutdown-warning -> record SW, alert, and store the
## current UTC epoch time as the start of the shutdown timer.
## Returns ${WDST_OK}.
act_start_timer()
{
  ## NB: curr_state is always ${SW} in this action
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Shutdown-Warning, Starting Timer, Temperature - ${curr_temp}"
  fi

  ## update the temp state file
  echo "${SW}" > "${TEMP_STATE}"
  ## change led color blinking yellow
  ledCtrl.sh LED_EV_THERMO LED_STAT_WARN
  ## send over-temperature with pending shutdown alert
  sendAlert.sh "${thermalShutdownPending}"
  ## start the shutdown timer
  date -u +%s > "${TEMP_SHUTDOWN_TIMER}"
  ## notify system for thermal state-change
  incUpdateCount.pm "${THERMAL_STATE_NFY_ID}"
  ## log critical
  logger -p "${FAC}.crit" "$0: Over-Temperature condition(${curr_temp}), Shutdown-Warning, Timer started"
  return ${WDST_OK}
}
## Action: still in shutdown-warning -> check the shutdown timer.
## While less than ${MAX_SW_TIME} seconds have passed nothing happens. Once it
## has expired, the timer is stopped, curr_state becomes ${SI} and the handler
## for last_state -> SI is run.
## Returns ${WDST_OK}, ${WDST_FAILED} when the stored timer value is missing,
## non-numeric or 0, or the status of the dispatched handler.
act_check_timer()
{
  local start_time
  local curr_time
  local action_hdlr

  ## NB: curr_state is always ${SW} in this action
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Shutdown-Warning, Checking Timer, Temperature - ${curr_temp}"
  fi

  ## get start timer, should never be 0
  start_time=""
  [ -r "${TEMP_SHUTDOWN_TIMER}" ] && read -r start_time _ < "${TEMP_SHUTDOWN_TIMER}"
  if ! [[ "${start_time}" =~ ^[0-9]+$ ]] || [ "${start_time}" -eq 0 ]; then
    return ${WDST_FAILED}
  fi

  ## get current time
  curr_time=$(date -u +%s)

  ## no action if timer has not expired
  ## NB: Temperature state shall remain SW
  if [ $(( curr_time - start_time )) -le "${MAX_SW_TIME}" ]; then
    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Timer has not expired"
    fi
    return ${WDST_OK}
  fi

  ## --- Timer has expired
  ## log notice
  logger -p "${FAC}.notice" "$0: Over-Temperature condition(${curr_temp}), Timer expired"
  ## stop the timer
  echo 0 > "${TEMP_SHUTDOWN_TIMER}"
  ## reset thermal led event
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  ## modify state to SI
  curr_state=${SI}

  ## initiate last_state->SI action
  ## execute the action based on last & current state
  action_hdlr=ACTION${last_state}[${curr_state}]
  ${!action_hdlr}
  return $?
}
## Action: entering the high-temperature warning state -> record WR and send
## the high-temperature alert.
## Returns ${WDST_OK}.
act_warning()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Normal -> Warning, Temperature - ${curr_temp}"
  fi

  ## update the temp state file
  echo "${WR}" > "${TEMP_STATE}"
  ## send high-temperature warning
  sendAlert.sh "${systemTemperatureHigh}"
  ## notify system for thermal state-change
  incUpdateCount.pm "${THERMAL_STATE_NFY_ID}"
  ## log
  logger -p "${FAC}.warning" "$0: High-Temperature(${curr_temp}) condition detected"
  return ${WDST_OK}
}
## Action: temperature back to normal from a warning state -> stop the timer,
## reset the led, send the normal-temperature alert and record NM.
## Returns ${WDST_OK}.
act_normal()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Shutdown-Warning -> Normal, Temperature - ${curr_temp}"
  fi

  ## stop the timer
  echo 0 > "${TEMP_SHUTDOWN_TIMER}"
  ## reset thermal led event
  ledCtrl.sh LED_EV_THERMO LED_STAT_OK
  ## send normal temperature alert
  sendAlert.sh "${temperatureNormal}"
  ## update the temp state file
  ## NB: Update state "after" all actions when switching to normal are completed
  echo "${NM}" > "${TEMP_STATE}"
  ## notify system for thermal state-change
  incUpdateCount.pm "${THERMAL_STATE_NFY_ID}"
  ## log notice
  logger -p "${FAC}.notice" "$0: Temperature of all drives(${curr_temp}) is now normal"
  return ${WDST_OK}
}
## Action: temperature dropped into the warning band while the last state was
## SW or SI. Stay in the last state while the temperature is above
## TEMP_T2 - HYSTERISIS; otherwise treat the current state as normal. Then run
## the handler for last_state -> (adjusted) curr_state.
## Returns the status of the dispatched handler.
act_hysterisis()
{
  ## kept local so the caller's own action_hdlr is not overwritten
  local action_hdlr

  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Hysterisis, Temperature - ${curr_temp}"
  fi

  ## remain in last state if within hysterisis; otherwise update current state to normal
  if [ "${curr_temp}" -gt $(( TEMP_T2 - HYSTERISIS )) ]; then
    curr_state=${last_state}
  else
    curr_state=${NM}
  fi

  ## execute the action based on last & current state
  action_hdlr=ACTION${last_state}[${curr_state}]
  ${!action_hdlr}
  return $?
}
## Action: unknown -> normal. Only records NM in the state file.
## Returns ${WDST_OK}.
act_UK_2_NM()
{
  ## debug
  if [ "${DEBUG}" -ne 0 ]; then
    logger -p "${FAC}.debug" "$0: Action Unknown -> Normal, Temperature - ${curr_temp}"
  fi

  ## NB: No need to notify system state-change as last state was unknown
  ## update the temp state file
  echo "${NM}" > "${TEMP_STATE}"
  return ${WDST_OK}
}
## --- Main script
## Initialises the saved state, then runs forever: every ${MONITOR_TIMER}
## seconds it reads the last saved state, determines the current state and
## runs the handler found in the ACTION table for that pair.
{
  ## exit if system has no internal drives
  if [ "${DVC_DRIVE_COUNT}" == "0" ]; then
    exit 0
  fi

  ## get list of drives
  drive_list=( $(internalDrives) )

  ## exit if no drives are found
  if [ -z "${drive_list}" ]; then
    exit 0
  fi

  logger -p "${FAC}.info" "$0: Starting Temperature Monitor"

  ## init temp state to normal if not over temperature
  if [ ! -f "${TEMP_STATE}" ]; then
    echo "${NM}" > "${TEMP_STATE}"
    if [ -f "${OVER_TEMP_FLAG}" ]; then
      echo "${SI}" > "${TEMP_STATE}"
    fi
  fi

  ## init shutdown timer if not in shutdown-warning state
  ## (an unreadable or non-numeric saved state counts as "not SW")
  last_state=( $(cat "${TEMP_STATE}") )
  if ! [[ "${last_state}" =~ ^[0-9]+$ ]] || [ "${last_state}" -ne "${SW}" ]; then
    echo 0 > "${TEMP_SHUTDOWN_TIMER}"
  fi

  ## loop every MONITOR_TIMER seconds
  ## NB: disable loop for if TEST=1
  ## The sleep sits at the top of every pass except the first, so "continue"
  ## also waits before the next run.
  first_pass=1
  while :; do
    if [ "${first_pass}" -eq 1 ]; then
      first_pass=0
    else
      sleep "${MONITOR_TIMER}"
    fi

    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Starting Temperature Monitor Run"
    fi

    ## get the last saved temperature state
    ## NB: This state was saved in an earlier run of this script
    last_state=( $(cat "${TEMP_STATE}") )
    if [ -z "${last_state}" ] || ! [[ "${last_state}" =~ ^[0-9]+$ ]] || [ "${last_state}" -ge "${N_STATES}" ]; then
      last_state=${NM}
      ## initialize the last state file
      echo "${NM}" > "${TEMP_STATE}"
    fi

    ## skip run if in standby & last state is normal
    if [ -f "${STANDBY_STATE}" ] && [ "${last_state}" -eq "${NM}" ]; then
      ## debug
      if [ "${DEBUG}" -ne 0 ]; then
        logger -p "${FAC}.debug" "$0: Skipping run as system is in standby"
      fi
      continue
    fi

    ## determine the current temperature state
    ## NB: This function shall set $curr_state & $curr_temp global vars
    determineCurrentState
    if [ $? -ne ${WDST_OK} ] || [ -z "${curr_state}" ]; then
      ## debug
      if [ "${DEBUG}" -ne 0 ]; then
        logger -p "${FAC}.debug" "$0: Failed to determine current state; continuing"
      fi
      continue
    fi

    ## execute the action based on last & current state
    ## NB: Cannot execute ${array${last_state}[${curr_state}]} directly
    ## It must be saved in to a variable x & executed using ${!x}
    action_hdlr=ACTION${last_state}[${curr_state}]

    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Last state=${last_state}; action=${!action_hdlr}"
    fi

    ${!action_hdlr}
    if [ $? -ne ${WDST_OK} ]; then
      logger -p "${FAC}.err" "$0: Failed to execute the action ${!action_hdlr}; last state=${last_state}; curr temp=${curr_temp}"
      continue
    fi

    ## debug
    if [ "${DEBUG}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Finished Temperature Monitor Run"
    fi

    ## break if TEST
    if [ "${TEST}" -ne 0 ]; then
      logger -p "${FAC}.debug" "$0: Finished Test Run"
      break
    fi
  done ## for MONITOR_TIMER infinite loop

  logger -p "${FAC}.info" "$0: Finished Temperature Monitor"
  exit 0
}
## --- End of Main script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# (c) 2010 Western Digital Technologies, Inc. All rights reserved.
#
# monitorVolume.sh
# Note: this is called by cron
#
# Checks that the data volume device shows up in df. If it does not (or the
# /tmp/tst_volume test hook exists) it raises the volumeFailure alert once and
# keeps /tmp/volume_failed as a marker; the marker is removed when the volume
# is present again.
#
PATH=/sbin:/bin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin

. /usr/local/sbin/share-param.sh
. /etc/nas/alert-param.sh
. /etc/system.conf
. /etc/wdcomp.d/wd-nas/wd-nas.conf 2> /dev/null
[ -f /usr/local/sbin/ledConfig.sh ] && . /usr/local/sbin/ledConfig.sh

lockFile="/tmp/monitorVolume"

# Release the lock: stop the background lockfile-touch and remove the lock.
release_lock() {
  kill "${pid}" >/dev/null 2>&1
  lockfile-remove "${lockFile}" >/dev/null 2>&1
}

# exit if in standby, or factory restore in progress
if [ -f /tmp/standby ] || [ -f "${reformatDataVolume}" ]; then
  exit 0
fi

# exit if system with no internal drives
if [ "${DVC_DRIVE_COUNT}" = "0" ]; then
  exit 0
fi

# exit if already another instance of script is in progress
if ! lockfile-create --retry 0 "${lockFile}" >/dev/null 2>&1; then
  exit 0
fi

# If script were to take longer than 5 minutes
lockfile-touch "${lockFile}" &
pid="$!"

# An empty device name is treated as "volume not found", as it was when the
# unquoted expansion left grep without a pattern.
volumePresent=false
if [ -n "${dataVolumeDevice}" ] && df | grep -q -- "${dataVolumeDevice}"; then
  volumePresent=true
fi

if [ "${volumePresent}" != true ] || [ -f /tmp/tst_volume ]; then
  # Alert only on the transition into the failed state.
  if [ ! -f /tmp/volume_failed ]; then
    sendAlert.sh "${volumeFailure}"
    incUpdateCount.pm system_state
  fi
  ledCtrl.sh LED_EV_VOLUME LED_STAT_ERR
  touch /tmp/volume_failed
  # clean up mutual exclusion
  release_lock
  exit 0
else
  rm -f /tmp/volume_failed
fi

# clean up mutual exclusion
release_lock
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment