#! /usr/local/bin/bash
#
# Calomel.org
# https://calomel.org/zfs_health_check_script.html
# FreeBSD 9.1 ZFS Health Check script
# zfs_health.sh @ Version 0.15
#
# Check health of ZFS volumes and drives. On any faults send email. In FreeBSD
# 10 there is supposed to be a ZFSd daemon to monitor the health of the ZFS
# pools. For now, in FreeBSD 9, we will make our own checks and run this script
# through cron a few times a day.

# Changelog
# Peter van der Does - Always send an email, even if there is no problem.
#                      I prefer to know a script has run even when there is no problem.
# June 24, 2015
# Peter van der Does - When a scrub is needed, the email subject line only has to inform us once.

# 99 problems but ZFS ain't one
problems=0
emailSubject="`hostname` - ZFS pool - HEALTH check"
emailMessage=""

# Health - Check if all zfs volumes are in good condition. We are looking for
# any keyword signifying a degraded or broken array.

condition=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
if [ "${condition}" ]; then
  emailSubject="$emailSubject - fault"
  problems=1
fi

# Capacity - Make sure pool capacities are below 80% for best performance. The
# percentage really depends on how large your volume is. If you have a 128GB
# SSD then 80% is reasonable. If you have a 60TB raid-z2 array then you can
# probably set the warning closer to 95%.
#
# ZFS uses a copy-on-write scheme. The file system writes new data to
# sequential free blocks first, and only when the uberblock has been updated do
# the new inode pointers become valid. This works well only when the pool has
# enough free sequential blocks. If the pool is at capacity and space limited,
# ZFS will have to write blocks randomly. This means ZFS cannot create an
# optimal set of sequential writes, and write performance is severely impacted.

maxCapacity=80

if [ ${problems} -eq 0 ]; then
  capacity=$(/sbin/zpool list -H -o capacity)
  for line in ${capacity//%/}
  do
    if [ $line -ge $maxCapacity ]; then
      emailSubject="$emailSubject - Capacity Exceeded"
      problems=1
    fi
  done
fi

# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
# on all volumes and all drives using "zpool status". If any non-zero errors
# are reported an email will be sent out. You should then look to replace the
# faulty drive and run "zpool scrub" on the affected volume after resilvering.

if [ ${problems} -eq 0 ]; then
  errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
  if [ "${errors}" ]; then
    emailSubject="$emailSubject - Drive Errors"
    problems=1
  fi
fi

# Scrub Expired - Check if all volumes have been scrubbed in at least the last
# 8 days. The general guide is to scrub volumes on desktop quality drives once
# a week and volumes on enterprise class drives once a month. You can always
# use cron to schedule "zpool scrub" in off hours. We scrub our volumes every
# Sunday morning, for example.
#
# Scrubbing traverses all the data in the pool once and verifies that all
# blocks can be read. Scrubbing proceeds as fast as the devices allow, though
# the priority of any I/O remains below that of normal calls. This operation
# might negatively impact performance, but the file system will remain usable
# and responsive while scrubbing occurs. To initiate an explicit scrub, use the
# "zpool scrub" command.
#
# The scrubExpire variable is in seconds. So for 8 days we calculate 8 days
# times 24 hours times 3600 seconds to equal 691200 seconds.

scrubExpire=691200

if [ ${problems} -eq 0 ]; then
  currentDate=$(date +%s)
  zfsVolumes=$(/sbin/zpool list -H -o name)

  for volume in ${zfsVolumes}
  do
    if [ $(/sbin/zpool status $volume | egrep -c "none requested") -ge 1 ]; then
      echo "ERROR: You need to run \"zpool scrub $volume\" before this script can monitor the scrub expiration time."
      break
    fi
    if [ $(/sbin/zpool status $volume | egrep -c "scrub in progress|resilver") -ge 1 ]; then
      break
    fi

    ### FreeBSD with *nix supported date format
    scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
    scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

    ### Ubuntu with GNU supported date format
    #scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
    #scrubDate=$(date -d "$scrubRawDate" +%s)

    if [ $(($currentDate - $scrubDate)) -ge $scrubExpire ]; then
      if [ ${problems} -eq 0 ]; then
        emailSubject="$emailSubject - Scrub Time Expired. Scrub Needed on Volume(s)"
      fi
      problems=1
      emailMessage="${emailMessage}Pool: $volume needs scrub \n"
    fi
  done
fi

# Notifications - On any problems send email with drive status information and
# capacities including a helpful subject line to root. Also use logger to write
# the email subject to the local logs. This is the place you may want to put
# any other notifications like:
#
#   + Update an anonymous twitter account with your ZFS status (https://twitter.com/zfsmonitor)
#   + Playing a sound file or beeping the internal speaker
#   + Updating Nagios, Cacti, Zabbix, Munin or even BigBrother

echo -e "$emailMessage \n\n\n `/sbin/zpool list` \n\n\n `/sbin/zpool status`" | mail -s "$emailSubject" root

if [ "$problems" -ne 0 ]; then
  logger $emailSubject
fi

### EOF ###
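As the header notes, the script is meant to be run from cron a few times a day. One possible root crontab entry, as a sketch only (the times are arbitrary; the install path matches the /usr/local/sbin/zfs_health.sh location used later in this thread):

# run the ZFS health check at 06:00, 14:00 and 22:00 every day
0 6,14,22 * * * /usr/local/sbin/zfs_health.sh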
@rusty725 try either setting a root alias to your external address in /etc/aliases, or make sure you have something like ssmtp set up; then you can just change root -> [email protected] on line https://gist.github.com/petervanderdoes/bd6660302404ed5b094d#file-zfs_health-sh-L134
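If you go the alias route, a minimal sketch (this assumes a sendmail-compatible MTA that reads /etc/aliases; the address below is just a placeholder):

# /etc/aliases -- forward root's mail to an external mailbox
root: you@example.com

Run newaliases afterwards so the change takes effect.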
Script is no longer working on FreeBSD 12. Executing it gives the following error:
/usr/local/sbin/zfs_health.sh
Failed conversion of ``23onSun-000000'' using format ``%Y%b%e-%H%M%S''
date: illegal time format
usage: date [-jnRu] [-d dst] [-r seconds|file] [-t west] [-v[+|-]val[ymwdHMS]]
[-I[date | hours | minutes | seconds]]
[-f fmt date | [[[[[cc]yy]mm]dd]HH]MM[.ss]] [+format]
/usr/local/sbin/zfs_health.sh: arithmetic expression: expecting primary: "1548512489 - "
I'm having the same problem on FreeBSD 12, did you find a fix?
root@mainserver:/home/donald # ./zfs_health.sh
Failed conversion of ``30onSat-000000'' using format ``%Y%b%e-%H%M%S''
date: illegal time format
usage: date [-jnRu] [-d dst] [-r seconds|file] [-t west] [-v[+|-]val[ymwdHMS]]
            [-I[date | hours | minutes | seconds]]
            [-f fmt date | [[[[[cc]yy]mm]dd]HH]MM[.ss]] [+format]
./zfs_health.sh: arithmetic expression: expecting primary: "1554116760 - "
root@mainserver:/home/donald #
EDIT:
Never mind, I found an updated script with new code for FreeBSD 12 in it:
### Ubuntu with GNU supported date format
#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $11" "$12" " $13" " $14" "$15}')
#scrubDate=$(date -d "$scrubRawDate" +%s)

### FreeBSD 11.2 with *nix supported date format
#scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $15 $12 $13}')
#scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)

### FreeBSD 12.0 with *nix supported date format
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $17 $14 $15}')
scrubDate=$(date -j -f '%Y%b%e-%H%M%S' $scrubRawDate'-000000' +%s)
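A quick way to sanity-check the new field numbers before dropping them into the script (the pool name tank is only an example):

# show the tokens the awk expression extracts from the scrub line
/sbin/zpool status tank | grep scrub | awk '{print $17 $14 $15}'
# parse them the same way the script does; this should print a Unix timestamp, not an error
date -j -f '%Y%b%e-%H%M%S' "$(/sbin/zpool status tank | grep scrub | awk '{print $17 $14 $15}')"'-000000' +%s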
On Ubuntu / Debian you can fix date parsing errors by simply incrementing the awk parameters by two:
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $13" "$14" " $15" " $16" "$17}')
Found this out myself too! See my gist at: https://gist.github.com/woftor/ae6a3185e6689e9f92740aed9b010dec
It also has rudimentary logging to a file (you can disable it if you want).
This one is based on the newest version (0.18) found on calomel.org
Thx a lot for this great script!
Thanks for the script. I had to remove the -i from https://gist.github.com/petervanderdoes/bd6660302404ed5b094d#file-zfs_health-sh-L27 to avoid an old zfs version being reported as unhealthy ("Some supported features are not enabled on the pool. The pool can still be used, but some features are unavailable.").
The date parsing on Ubuntu is unreliable if you mix pools which take more than one day to scrub with pools that don't, or if you have pools which sometimes (but not always) take more than one day to scrub. This is because the zpool status command will selectively include or omit the "X days" tokens, throwing off the field counts used in the awk command that follows.
We can make this a little more robust by having awk parse from the end going backwards, rather than from the front going forwards:
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $(NF-4)" "$(NF-3)" " $(NF-2)" " $(NF-1)" "$(NF)}')
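To illustrate with a made-up scan line (the exact wording varies between ZFS versions), counting back from the last field still lands on the five date tokens even when "1 days" is present:

echo "scan: scrub repaired 0B in 1 days 01:57:24 with 0 errors on Sun Sep  8 02:21:26 2024" \
  | awk '{print $(NF-4)" "$(NF-3)" "$(NF-2)" "$(NF-1)" "$NF}'
# prints: Sun Sep 8 02:21:26 2024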
Very nice, that worked on my Proxmox setup. But in case someone needs to be more specific, you can also use
scrubRawDate=$(/sbin/zpool status $volume | grep scrub | awk '{print $6" "$7" " $8" " $9" "$10}')
Is there any reason why someone would not use zpool status -x instead of https://gist.github.com/petervanderdoes/bd6660302404ed5b094d#file-zfs_health-sh-L27 ?
Reproducing the catch
Example 1:

# zpool status -v
  pool: rpool
 state: ONLINE
  scan: resilvered 498G in 00:29:20 with 0 errors on Tue Sep 10 13:24:04 2024
remove: Removal of vdev 1 copied 7.73M in 0h0m, completed on Tue Sep 10 12:36:45 2024
        1.83K memory used for removed device mappings
config:

        NAME                              STATE     READ WRITE CKSUM
        rpool                             ONLINE       0     0     0
          mirror-0                        ONLINE       0     0     0
            wwn-0x50026b7686bba165-part3  ONLINE       0     0     0
            wwn-0x50026b7686bba1fb-part3  ONLINE       0     0     0

errors: No known data errors
but the check the script runs still matches:
/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)'
1.83K memory used for removed device mappings
Example 2:

# zpool status -v
  pool: rpool
 state: ONLINE
  scan: scrub in progress since Thu Sep 12 08:22:01 2024
        374G scanned at 161M/s, 340G issued at 146M/s, 703G total
        0B repaired, 48.36% done, 00:42:27 to go
config:

        NAME                              STATE     READ WRITE CKSUM
        rpool                             ONLINE       0     0     0
          mirror-0                        ONLINE       0     0     0
            scsi-3500003983803294d-part3  ONLINE       0     0     0
            scsi-350000398281094c5-part3  ONLINE       0     0     0

errors: No known data errors
# /sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)'
(no output)

Almost the same pools, but Example 1 gets caught on a single word, because REMOVED is included in the extended grep. Notice that DEGRADED, deGraDed and degraded are all treated the same; with -i, case is not respected (at least in the Linux ecosystem).
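That false positive is easy to reproduce on its own:

echo "1.83K memory used for removed device mappings" | egrep -i 'REMOVED'

The line is printed, so the script counts it as a fault.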
Conclusion: instead of trying to catch possible issues ourselves, it would be better to let zpool do its job and just parse zpool status -x:
# zpool status -x
all pools are healthy
If your host has an issue you will get:
$: # zpool status -x
  pool: rpool
 state: DEGRADED
status: One or more devices could not be used because the label is missing or
        invalid. Sufficient replicas exist for the pool to continue
        functioning in a degraded state.
action: Replace the device using 'zpool replace'.
   see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-4J
  scan: scrub repaired 0B in 01:57:24 with 0 errors on Sun Sep 8 02:21:26 2024
config:

        NAME                              STATE     READ WRITE CKSUM
        rpool                             DEGRADED     0     0     0
          mirror-0                        DEGRADED     0     0     0
            scsi-350000397280ade51-part3  ONLINE       0     0     0
            10842156768423579076          UNAVAIL      0     0     0  was /dev/disk/by-id/scsi-35000039838090a49-part3

errors: No known data errors
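A minimal sketch of what the script's health check could look like with this approach (untested, and it assumes the literal "all pools are healthy" message):

# Health - let zpool decide: anything other than "all pools are healthy" is a problem
condition=$(/sbin/zpool status -x | grep -v 'all pools are healthy')
if [ "${condition}" ]; then
  emailSubject="$emailSubject - fault"
  problems=1
fi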
> Is there any reason why someone would not use zpool status -x instead?

I use -x. Is there something like that for checksum errors? Yeah, I'll Google it also :)
How do I make this script send emails to external addresses?