Last active
May 30, 2025 16:11
-
-
Save spchamp/cced255372fa1ee88f15d92e3a576ad6 to your computer and use it in GitHub Desktop.
Shell script for ZFS zpool recovery using zdb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# zpool_recovery.sh | |
# | |
# usage: zpool_recovery.sh [-p pool] | |
# [-r dest_pool/recovery_path | -d dumpdir] | |
# [-t txg_id] [-i index_file] [-z xz_or_pixz_args ] | |
# [-x][-n][-b][-w][-q] | |
# | |
# default pool: tank | |
# default dumpdir: $PWD/dump (streams are stored under $PWD/dump/each)
# default txg_id: Will be determined with zdb | |
# | |
# other args are described below
# | |
# recommended usage: | |
# | |
# # zfs set mountpoint=none dest_pool/dest_dataset
# # zpool_recovery.sh -p pool -r dest_pool/dest_dataset | |
# | |
# This assumes that the dest_pool will have sufficient | |
# storage space for the contents of all/most top-level | |
# filesystem and volume datasets from the recovered pool, | |
# absent of earlier snapshots. | |
# | |
# this script will try to use zdb -B to produce a zfs-send | |
# stream for each filesystem or volume in the pool denoted | |
# with '-p' | |
# | |
# If a dataset cannot be recovered from the top-level | |
# volume or filesystem dataset, this script will try | |
# to locate the most recent snapshot that can be recovered | |
# for the original dataset. Other snapshots will not be | |
# recovered. If a dataset does not have any earlier | |
# snapshots, this script will continue to the next | |
# recoverable dataset. | |
# | |
# | |
# Caveats: | |
# | |
# it's assumed that the origin pool has not changed | |
# across subsequent calls to this script | |
# | |
# for any dataset ID in which ${DUMPDIR}/each/${ID}.SKIP exists,
# the dataset will not be recovered, whether using the | |
# dumpdir or recovery_pool destination. The ID value would | |
# denote the object set ID for the filesystem, as stored | |
# in the index file. | |
# | |
# partly supported here: recovery for encrypted datasets, | |
# mainly in args for zfs-send(8)-style args for zdb -B | |
# | |
# if the original pool uses encryption, this script can | |
# be called with the -x arg to prevent the zdb -B call | |
# from using the -e arg like zfs-send, and in which case | |
# the -w arg will be used for zfs-send, assuming similar | |
# encryption for the receiving pool. (This has not been | |
# tested at this time) | |
# | |
# further documentation is provided in the following | |
# Abort the script on the first unhandled command failure.
set -e
## use set -x to echo script commands (debugging)
# set -x
# args | |
# | |
# [ -p POOLNAME ] : use POOLNAME as the origin pool, | |
# default is 'tank' | |
# | |
# [ -t TXG ] : assume TXG is the best txg value for recovery. | |
# default is determined with zdb, then stored | |
# in the file <DUMPDIR>/TXG | |
# | |
# [ -i FILENAME ] : filename for object set info from zdb, compressed | |
# with xz or pixz. default is <DUMPDIR>/index.xz
# | |
# [ -x ] : assume pool is encrypted; use -w and not -e as with zfs send | |
# | |
# [ -n ] : do not use compression as with zfs send -c | |
# | |
# [ -b ] : do not use large block support as with zfs send -L | |
# | |
# [ -w ] : use xz, not pixz, even if pixz is installed | |
# | |
# [ -z ARGSTR ] : arg string (quoted) for xz / pixz | |
# | |
# [ -q ] : produce less output on stdout | |
# | |
# | |
# Persistence | |
# | |
# [ -d DUMPDIR ] : use DUMPDIR for persistent data. default: ${PWD}/dump | |
# | |
# This dumpdir will be used for some persistent data, even when a | |
# receiving pool is provided in args. | |
# | |
# Though not recommended, if no receiving pool is provided in args | |
# then the dumpdir will be used for storing all recovered object set | |
# streams, each compressed further with xz or pixz. The pathname for | |
# these stream files: <DUMPDIR>/each/<ID>.xz | |
# | |
# These can then be restored using xzcat and zfs receive onto some new | |
# pool. | |
# | |
# [ -r RECV_POOL/PATH ] : receiving pool and dataset path | |
# | |
# It's assumed by default that this receiving pool can receive | |
# streams as from 'zfs send -Lec'. Each of these send-style args | |
# can be disabled with individual args to this script, described | |
# in the previous, if required for the receiving pool. | |
# | |
# It's recommended to set 'mountpoint=none' on the receiving dataset, | |
# until the recovery process may be considered complete | |
# | |
# In all cases, additional files used by this script: | |
# | |
# % <DUMPDIR>/TXG : If exists, the file must contain only the | |
# transaction group ID (TXG) to use when dumping from the original | |
# pool. | |
# | |
# If this file does not exist, the TXG file will be created from the | |
# numerically largest usable TXG available for purposes of recovery | |
# with ZDB. | |
# | |
# If a -t TXG arg is provided, this value will override any value
# stored in <DUMPDIR>/TXG | |
# | |
# % <DUMPDIR>/index.xz : If exists, the file should contain object set | |
# information as produced with zdb, compressed using pixz or xz. | |
# | |
# If the index file does not exist, the file will be created after | |
# parsing debug data from zdb | |
# | |
# % <DUMPDIR>/each/<ID>.SKIP | |
# | |
# For any file that exists with this syntax, the object set (i.e | |
# filesystem or volume) for the matching ID will not be dumped, | |
# whether to a receiving pool or to the "each" dir under the DUMPDIR. | |
# | |
# The corresponding dataset for each ID can be determined once the | |
# index.xz file is created, or by using a zdb command similar to that | |
# used when creating the index file in this script, e.g | |
# | |
# # zdb -d -AAAXe -t ${TXG} ${POOL} | less | |
# | |
# The contents of the each/<ID>.SKIP file will not be analyzed by this | |
# script. The file can be created generally with touch(1) with a path | |
# corresponding to the object set ID for any dataset that should not | |
# be recovered with this script. | |
# | |
# % Only when recovering to DUMPDIR: <DUMPDIR>/each/<ID>.xz | |
# | |
# When using the DUMPDIR recovery method, the existence of this | |
# file will be assumed to indicate that the corresponding ID | |
# has already been recovered. If the script is interrupted before | |
# the stream file is completely written, the partially recovered
# stream file should be manually deleted before any subsequent | |
# call to this script. As such, the output from this script may | |
# generally be stored in this case, such as with 'script -F' in order | |
# to be able to determine what TXG files may have been created before | |
# any interruption in the send (e.g interruption due to a corrupted | |
# dataset, or due to poweroff or system error) | |
# | |
# For partial sends using the -r RECVPOOL/PATH storage method, | |
# any dataset created on receipt of the partial send should be | |
# automatically deleted by ZFS, after the send is interrupted | |
# | |
# (this script does not use resumable sends) | |
# | |
# % Additional Notes and Known Limitations | |
# | |
# - this script provides limited handling for signals, e.g interrupt. | |
# | |
# this signal handling will generally be affected by the shell | |
# interpreter under which the script is run, e.g /bin/sh or GNU BASH | |
# | |
# - limited support for partial sends | |
# | |
# - partial sends should generally not persist in any receiving pool | |
# | |
# - if recovering to a dumpdir, partially sent files will not be | |
# automatically deleted and may result in incomplete recovery, if | |
# each incomplete dump file is not removed before any subsequent | |
# script run | |
# | |
# - The dumpdir support represented one earlier storage method used when | |
# developing this script. It's generally recommended to recover to a
# new zpool, rather than to store the individual send/receive streams | |
# to a dumpdir. | |
# | |
# - Unless not supported by the receiving pool, compression support | |
# and large block support should generally not be disabled when | |
# running this shell script | |
# | |
# - During zfs receive, a @--HEAD-- snapshot will be created for every | |
# successfully received dataset - except in the case of a dataset | |
# recovered from some earlier snapshot. These snapshots will then be | |
# used by this script, to determine whether each dataset was | |
# completely sent. These individual snapshots should not be deleted | |
# from the receiving pool until the recovery process may be considered | |
# complete. | |
# | |
# - This script may not be able to restore any non-inherited mount points | |
# or other ZFS properties that were in use for datasets in the original | |
# pool | |
# | |
# it's generally recommended to set mountpoint=none on the receiving | |
# ZFS dataset, at least until such time as when the original mountpoints | |
# can be restored to the recovered datasets | |
# | |
# - this script may not detect all incomplete sends, such as when a | |
# dataset could not be sent due to corrupted storage data | |
# | |
# one type of error message that may occur on stderr, during send: | |
# | |
# dump_backup: dmu_send_obj: Input/output error | |
# | |
# this error message may indicate a corrupted object set. | |
# | |
# if any snapshot exists for the corresponding dataset within the original | |
# pool, then any of those snapshots may still be usable for purpose of | |
# recovery. | |
# | |
# after detecting an incomplete send as such, this script will try to | |
# recover the corresponding dataset from the newest available snapshot | |
# for the same dataset | |
# | |
# additional information presented on stderr for this script may | |
# include lines such as the following, corresponding with an error | |
# condition such as above: | |
# | |
# #-- <scriptname> Failed: receive for <dataset_name> [<object_set_id>] | |
# | |
# - TXG values may be listed on a per-vdev basis with the command | |
# | |
# # zdb -ul -AAAXe /path/to/vdev | awk '$1 == "txg" { print $3 }' | |
# | |
# TXG IDs produced by ZDB may generally be listed in chronological | |
# order. This order can be reversed, for displaying any more recent TXG | |
# ID first, such as by piping the output of the above command to GNU | |
# tac (gtac, from GNU coreutils) or to `tail -r` on FreeBSD systems. | |
# This assumes that a numerically larger TXG ID may represent a more | |
# recent transaction group. | |
# | |
# Copyright (c) 2025 Sean Champ. All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions | |
# are met: | |
# | |
# 1. Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# | |
# 2. Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
# SUCH DAMAGE. | |
## try to detect the script name, which may not be available | |
## if this script is sourced via sh(1) or BASH | |
# THIS: base name of the running script, used as the prefix of every
# diagnostic message. Falls back to a fixed name when the path in $0
# cannot be resolved (e.g. when the file is sourced and $0 is "-bash",
# which readlink rejects as an option string).
if THIS=$(readlink -f "$0" 2>/dev/null) && [ -n "${THIS}" ]; then
    # quoted, so that a script path containing whitespace still works
    THIS=$(basename -- "${THIS}")
else
    THIS="zpool_recovery.sh"
fi
# msg TEXT...
#
# Write one diagnostic line to stderr, prefixed with "#-- <script name>: ".
# All arguments are joined with single spaces. printf is used instead of
# echo so that text beginning with "-n" or containing backslashes is
# printed literally.
msg() {
    printf '#-- %s: %s\n' "${THIS:-zpool_recovery.sh}" "$*" 1>&2
}
# fail TEXT...
#
# Report TEXT through msg (stderr) and terminate with exit status 1.
#
# Note: when called from inside a pipeline stage or any other subshell,
# 'exit' ends only that subshell, not the whole script.
fail() {
    msg "$@"
    exit 1
}
# | |
# args handling | |
# | |
# Defaults for everything that can be set from the command line.
# INDEX is deliberately not initialised here; its default depends on
# DUMPDIR and is applied after option parsing.
POOL=tank
RECVDEST=""
DUMPDIR=dump
TXG=""
ENCRYPTED=""
COMPRESSION="c"
LARGEBLOCKS="L"
XZ=""
XZARGS=""
QUIET=""

# parse_args ARG...
#
# Parse the script options into the globals above. OPTIND is left at
# the index of the first non-option argument, for later operand checks.
# Calls fail (exit 1) on an unknown option or a missing option argument.
parse_args() {
    local ARG
    OPTIND=1
    # leading ':' selects silent mode: errors are reported via ARG/OPTARG
    while getopts ":p:t:d:i:xnbwz:qr:" ARG; do
        case "${ARG}" in
        p)
            POOL="${OPTARG}"
            ;;
        t)
            TXG="${OPTARG}"
            ;;
        d)
            DUMPDIR="${OPTARG}"
            ;;
        i)
            INDEX="${OPTARG}"
            ;;
        x)
            ENCRYPTED="y"
            ;;
        n)
            COMPRESSION=""
            ;;
        b)
            # documented in the usage text, but previously not parsed
            LARGEBLOCKS=""
            ;;
        w)
            XZ="xz"
            ;;
        z)
            # repeated -z options accumulate, separated by one space
            XZARGS="${XZARGS}${XZARGS:+ }${OPTARG}"
            ;;
        q)
            QUIET="y"
            ;;
        r)
            RECVDEST="${OPTARG}"
            ;;
        :)
            fail "Option -${OPTARG} requires an argument"
            ;;
        *)
            fail "Unknown options in args: -${OPTARG}"
            ;;
        esac
    done
}

parse_args "$@"
# | |
# further arg checks | |
# | |
# The script takes options only. Drop everything getopts consumed
# (OPTIND is left by the option loop) and reject whatever remains, so
# that a stray operand anywhere on the command line is reported instead
# of being silently ignored.
shift $((OPTIND - 1))
if [ "$#" -gt 0 ]; then
    fail "Unknown options in args: $*"
fi
## not a necessary check, when using '-F' on receive | |
# case ${RECVPOOL} in | |
# ""|*/*) | |
# ;; | |
# *) | |
# fail "RECVPOOL must not denote the receiving pool name: ${RECVPOOL}" | |
# ;; | |
# esac | |
# | |
# deferred defaults for args | |
# | |
# Index file: defaults to <DUMPDIR>/index.xz unless given with -i
if [ "x${INDEX:-}" = "x" ]; then
    INDEX="${DUMPDIR}/index.xz"
fi

# Compressor: prefer pixz, fall back to xz, unless -w forced xz.
# 'command -v' is the portable lookup and prints nothing when the
# tool is missing.
if [ "x${XZ}" = "x" ]; then
    if ! XZ=$(command -v pixz); then
        XZ=$(command -v xz) || fail "xz not found"
    fi
fi

# _send_flags LARGEBLOCKS COMPRESSION ENCRYPTED
#
# Print the backup flag string for 'zdb -B'. zdb takes these flags as
# ONE trailing argument made of the letters e, L, c and w (the same
# meanings as in zfs-send), not as separate dash options; "-L -c -e"
# after the target is liable to be read as zdb's own options.
#   LARGEBLOCKS  non-empty: add L
#   COMPRESSION  non-empty: add c
#   ENCRYPTED    "y": add w (raw send), otherwise add e (embedded data)
_send_flags() {
    local FLAGS="${1:+L}${2:+c}"
    if [ "${3:-}" = "y" ]; then
        FLAGS="${FLAGS}w"
    else
        FLAGS="${FLAGS}e"
    fi
    printf '%s\n' "${FLAGS}"
}

SEND_ARGS=$(_send_flags "${LARGEBLOCKS:-}" "${COMPRESSION:-}" "${ENCRYPTED:-}")

# directory for per-dataset stream files and <ID>.SKIP markers
EACHDIR="${DUMPDIR}/each"
# environment option: TAC | |
# | |
# If not set: | |
# - use GNU tac if available | |
# - else, assume FreeBSD or similar: use tail -r | |
# | |
# ${TAC} is used to reverse the order of zdb | |
# debugging info, whether chronologically or | |
# for the structure of the recovered pool | |
# | |
# Pick the line-reversal command unless TAC was given in the
# environment. The lookups must run in the current shell: assigning
# inside a ( ... ) subshell would leave TAC empty here, and an empty
# ${TAC} pipeline stage silently discards all of its input.
if [ "x${TAC:-}" = "x" ]; then
    if TAC=$(command -v gtac 2>/dev/null); then
        : # GNU tac installed under its g-prefixed name
    elif TAC=$(command -v tac 2>/dev/null); then
        : # GNU tac
    else
        # assume FreeBSD or similar
        TAC="tail -r"
    fi
fi
# VERBOSE is the verbosity switch handed to 'zfs receive':
# empty under -q, otherwise "-v".
if [ "${QUIET:-}" = "y" ]; then
    VERBOSE=""
else
    VERBOSE="-v"
fi
# | |
# main script | |
# | |
# info TEXT...
#
# Like msg, but suppressed entirely when the script runs with -q
# (QUIET=y). Returns 0 when suppressed.
info() {
    if [ "${QUIET:-}" != "y" ]; then
        msg "$@"
    fi
}
# ensure_txg
#
# Make sure the global TXG holds the transaction group to recover from.
# Sources, in order of precedence:
#   1. TXG already set (the -t option)
#   2. the first line of <DUMPDIR>/TXG, if that file exists
#   3. a scan of the uberblocks of each vdev of ${POOL}; the numerically
#      largest txg for which 'zdb -d -t <txg>' prints anything is
#      stored in <DUMPDIR>/TXG
#   4. the "best uberblock" line of 'zdb -AAAXe ${POOL}'
# Calls fail when no TXG can be determined at all.
#
# Globals: POOL, DUMPDIR (read); TXG (read/written)
ensure_txg() {
    local VDEV COUNT LTXG TMPV
    # when TXG was provided in top-level script args
    if [ "x${TXG}" != "x" ]; then
        return 0
    fi
    # when TXG was already stored under DUMPDIR
    if [ -e "${DUMPDIR}/TXG" ]; then
        TXG=$(cat "${DUMPDIR}/TXG")
        info "Found existing TXG ${TXG}"
        return 0
    fi
    # Both loops below run in pipeline subshells, so neither 'return'
    # nor a variable assignment made inside them reaches this function.
    # The result is handed back through <DUMPDIR>/TXG instead, and the
    # existence of that file is what stops the scan.
    #
    # outer loop: one iteration per vdev path reported for the pool
    zdb -AAAXe "${POOL}" |
        awk '$1 == "path:" { gsub("['\'']", "", $2); print $2 }' |
        while read -r VDEV; do
            # an earlier vdev already produced a usable TXG
            if [ -e "${DUMPDIR}/TXG" ]; then
                break
            fi
            info "trying vdev ${VDEV}"
            # List "<uberblock index> <txg>" pairs for this vdev,
            # uberblock 0 included. Every label repeats the same
            # uberblocks, so sort numerically by txg, largest first,
            # keeping one line per txg.
            zdb -ul -AAAXe "${VDEV}" </dev/null |
                awk -v "COUNT=-1" '$1 ~ "^Uberblock" {
                        sub("Uberblock\\[", "", $1);
                        sub("\\]", "", $1);
                        COUNT = $1 + 0;
                        next;
                    }
                    COUNT >= 0 && $1 == "txg" {
                        print COUNT " " $3
                    }' |
                sort -u -k2,2nr |
                while read -r COUNT LTXG; do
                    # inner loop: find the first txg that zdb can open
                    info "Trying ${VDEV} uberblock nr. ${COUNT} txg ${LTXG}"
                    TMPV=$(zdb -d -t "${LTXG}" -AAAXe "${POOL}" </dev/null |
                        head -n1)
                    if [ "x${TMPV}" != "x" ]; then
                        info "Found txg [${COUNT}] ${LTXG}"
                        echo "${LTXG}" > "${DUMPDIR}/TXG"
                        break
                    fi
                done
        done
    # pick up the value stored by the scan, if any
    if [ -e "${DUMPDIR}/TXG" ]; then
        TXG=$(head -n1 "${DUMPDIR}/TXG")
        return 0
    fi
    # try a fallback ..
    info "Using fallback detection for TXG"
    TXG=$(zdb -AAAXe "${POOL}" |
        awk '/best uberblock/ { print $12 }' |
        tail -n1)
    # an empty TXG would corrupt the argument list of every later zdb call
    if [ "x${TXG}" = "x" ]; then
        fail "Unable to determine a usable TXG for pool ${POOL}"
    fi
}
# ensure_index
#
# Create the compressed object set index ${INDEX} unless it already
# exists. The index holds the "Dataset ... [ZPL] ..." lines printed by
# 'zdb -d' for ${POOL} at ${TXG}, in reversed order (${TAC}), compressed
# with ${XZ} plus any -z arguments (${XZARGS}).
#
# The index is written to a temporary name and renamed only after the
# compressor succeeded, so an interrupted run does not leave behind a
# truncated file that a later run would accept as complete.
#
# Globals: INDEX, TXG, POOL, TAC, XZ, XZARGS (read)
ensure_index() {
    local TMPINDEX
    if [ -e "${INDEX}" ]; then
        info "Using existing index: ${INDEX}"
        return 0
    fi
    info "Generating index: ${INDEX}"
    TMPINDEX="${INDEX}.tmp.$$"
    # ${TAC}, ${XZ} and ${XZARGS} are left unquoted on purpose: each
    # may hold a command plus arguments (e.g. TAC="tail -r")
    if zdb -d -AAAXe -t "${TXG}" "${POOL}" |
        awk -v "FS=[ ,]" '/^Dataset/ && $3 == "[ZPL]" { print $0 }' |
        ${TAC} | ${XZ} ${XZARGS:-} > "${TMPINDEX}"; then
        mv -f "${TMPINDEX}" "${INDEX}"
    else
        rm -f "${TMPINDEX}"
        fail "Failed to generate index: ${INDEX}"
    fi
}
# parse_index_ds
#
# Print one "<dataset name> <object set ID>" line for every [ZPL]
# dataset in the index that is not a snapshot (no '@' in its name).
# The index is created first if it does not exist yet.
#
# Globals: INDEX (read)
parse_index_ds() {
    ensure_index
    # with FS=[ ,] a line "Dataset NAME [ZPL], ID N, ..." splits into
    # $2 = NAME, $3 = "[ZPL]", $6 = N
    xzcat "${INDEX}" |
        awk -v "FS=[ ,]" '/^Dataset/ && $2 !~ "@" && $3 == "[ZPL]" {
            print $2 " " $6 }'
}
# parse_index_snap DATASET [SKIP]
#
# Second-level recovery support: print "<snapshot name> <object set ID>"
# for one snapshot of DATASET, taken from the existing index, or print
# nothing when there is none.
#
#   DATASET  dataset name as stored in the index, e.g. tank/home
#   SKIP     optional snapshot that could not be sent, given either as
#            a bare snapshot name or as the full DATASET@SNAP name that
#            this function prints. When given, the snapshot listed
#            directly after it in the index is printed; the index is
#            stored in reversed order, so that one is assumed to be the
#            chronologically previous snapshot. Without SKIP the first
#            listed snapshot of DATASET is printed.
#
# The dataset name is matched as a literal prefix, never as a regular
# expression, so characters such as '.' in a name match only themselves.
#
# Globals: INDEX (read)
parse_index_snap() {
    if [ "$#" -lt 1 ]; then
        fail "script error: parse_index_snap called without dataset name"
    fi
    local DNAME="$1"
    local SKIP="${2:-}"
    # reduce a full "dataset@snap" name to the bare snapshot name
    SKIP="${SKIP#*@}"
    xzcat "${INDEX}" |
        awk -v "FS=[ ,]" -v "DS=${DNAME}" -v "SKIP=${SKIP}" '
            BEGIN { PREFIX = DS "@"; SKIPPING = (SKIP != "") }
            # ignore everything that is not a snapshot of DS
            $1 != "Dataset" || $3 != "[ZPL]" || index($2, PREFIX) != 1 { next }
            # drop entries up to and including the skipped snapshot
            SKIPPING { if ($2 == PREFIX SKIP) SKIPPING = 0; next }
            { print $2 " " $6; exit }'
}
# handle_stream NAME ID [SNAP]
#
# Recover one object set of ${POOL} with 'zdb -B'.
#
#   NAME  dataset name, used for messages and for the receiving path
#   ID    object set ID handed to 'zdb -B'
#   SNAP  optional: the snapshot that ID belongs to. It is passed on to
#         handle_snap if this stream fails, so the next attempt starts
#         at the snapshot listed after it.
#
# Nothing is done (status 0) when <EACHDIR>/<ID>.SKIP exists or when the
# object set counts as recovered already: <EACHDIR>/<ID>.xz exists
# (dumpdir mode), or the receiving dataset has at least one snapshot
# (-r mode).
#
# dumpdir mode (RECVDEST empty): the stream is compressed into
# <EACHDIR>/<ID>.xz and the dataset name is stored in <EACHDIR>/<ID>.name.
# The stream is written to <ID>.xz.part and renamed only after the
# compressor succeeded, so an interrupted run is not mistaken for a
# finished one. Returns 1 if the compressor fails. A zdb error in the
# middle of the stream is NOT detected in this mode.
#
# -r mode: the stream is piped into 'zfs receive -dF ${RECVDEST}'. If the
# receive fails, the failure is reported and handle_snap is called to
# try a snapshot of the same dataset.
#
# Globals: EACHDIR, RECVDEST, POOL, TXG, SEND_ARGS, XZ, XZARGS, VERBOSE
handle_stream() {
    if [ "$#" -lt 1 ]; then
        fail "script error: handle_stream called without NAME arg"
    fi
    if [ "$#" -lt 2 ]; then
        fail "script error: handle_stream called without ID arg"
    fi
    local NAME="$1"
    local ID="$2"
    local SNAP="${3:-}"
    local RECVNAME RECV_OK PARTIAL

    if [ -e "${EACHDIR}/${ID}.SKIP" ]; then
        info "skip: ${ID}"
        return 0
    fi

    if [ "x${RECVDEST}" = "x" ]; then
        if [ -e "${EACHDIR}/${ID}.xz" ]; then
            info "Already recovered: ${ID}"
            return 0
        fi
    else
        # name of the received dataset, in a syntax as with
        # 'zfs receive -d': the origin pool name is replaced by RECVDEST
        if [ "${NAME}" = "${POOL}" ]; then
            RECVNAME="${RECVDEST}"
        else
            RECVNAME="${RECVDEST}/${NAME#*/}"
        fi
        # a failing 'zfs list' only means "not received yet"
        RECV_OK=$(zfs list -t snapshot -pH -oname "${RECVNAME}" 2>/dev/null) ||
            true
        if [ "x${RECV_OK}" != "x" ]; then
            info "Already recovered: ${ID} ${RECVNAME}"
            return 0
        fi
    fi

    # ${SEND_ARGS}, ${XZ}, ${XZARGS} and ${VERBOSE} are left unquoted on
    # purpose: each may be empty or hold several words.
    if [ "x${RECVDEST}" = "x" ]; then
        info "recovering ${NAME} [${ID}] to ${EACHDIR}/${ID}.xz"
        printf '%s\n' "${NAME}" > "${EACHDIR}/${ID}.name"
        PARTIAL="${EACHDIR}/${ID}.xz.part"
        if zdb -eAAAX -t "${TXG}" -B "${POOL}/${ID}" ${SEND_ARGS} |
            ${XZ} ${XZARGS:-} > "${PARTIAL}"; then
            mv -f "${PARTIAL}" "${EACHDIR}/${ID}.xz"
        else
            # this detects errors of the compressor or of filesystem
            # I/O only, not a ZFS error within the sent stream
            rm -f "${PARTIAL}"
            msg "Failed: receive for ${NAME} [${ID}]"
            return 1
        fi
    else
        info "recovering ${NAME} [${ID}] onto ${RECVDEST}"
        if ! zdb -eAAAX -t "${TXG}" -B "${POOL}/${ID}" ${SEND_ARGS} |
            zfs receive -dF ${VERBOSE} "${RECVDEST}"; then
            msg "Failed: receive for ${NAME} [${ID}]"
            # fall back to a snapshot of the same dataset; this
            # recurses into handle_stream
            handle_snap "${NAME}" "${ID}" ${SNAP:+"${SNAP}"}
        fi
    fi
}
# handle_snap NAME ID [SNAP]
#
# Snapshot fallback for a dataset whose stream could not be received.
#
#   NAME  dataset name
#   ID    object set ID of the stream that failed (checked for presence
#         only)
#   SNAP  optional: the snapshot that failed. The search continues with
#         the snapshot listed after it in the index; without SNAP the
#         first listed snapshot of NAME is used.
#
# The chosen snapshot is handed to handle_stream, which calls back into
# this function if that stream fails as well. When the index has no
# further snapshot for NAME, this is reported with info and the function
# returns 0, so the caller carries on with the next dataset.
handle_snap() {
    if [ "$#" -lt 1 ]; then
        fail "script error: handle_snap called without NAME arg"
    fi
    if [ "$#" -lt 2 ]; then
        fail "script error: handle_snap called without ID arg"
    fi
    local NAME="$1"
    local ID="$2"
    local SNAP="${3:-}"
    local FOUND NEXTSNAP NEXTID

    # parse_index_snap prints at most one "<snapshot> <ID>" line.
    # Capturing it, instead of piping it into a loop, keeps the rest of
    # this function out of a subshell and makes the empty case visible.
    FOUND=$(parse_index_snap "${NAME}" ${SNAP:+"${SNAP}"}) || FOUND=""
    if [ "x${FOUND}" = "x" ]; then
        info "No further snapshots: ${NAME}"
        return 0
    fi
    NEXTSNAP="${FOUND%% *}"
    NEXTID="${FOUND##* }"
    # stdin is detached so nothing in the nested call can consume input
    # that belongs to a caller
    handle_stream "${NAME}" "${NEXTID}" "${NEXTSNAP}" </dev/null ||
        info "No further snapshots: ${NAME}"
}
# | |
# main() | |
# | |
## try to not call through to this when debugging via 'source' with BASH | |
# Run the recovery unless this file is being sourced by BASH (in which
# case BASH_SOURCE is set and differs from $0). Shells without
# BASH_SOURCE always run it.
if [ "x${BASH_SOURCE:-}" = "x" ] || [ "${BASH_SOURCE}" = "${0}" ]; then
    mkdir -p "${DUMPDIR}"
    mkdir -p "${EACHDIR}"
    ensure_txg
    info "using TXG: ${TXG}"
    ensure_index
    parse_index_ds |
        while read -r NAME ID; do
            # A dataset that cannot be recovered must not end the whole
            # run under 'set -e': report it and go on with the next one.
            # stdin is detached so that nothing started for this dataset
            # can consume the remaining index lines.
            handle_stream "${NAME}" "${ID}" </dev/null ||
                msg "Recovery failed, continuing: ${NAME} [${ID}]"
            ## try to make it interruptable (no signal handling here)
            sleep 1
        done
fi # BASH_SOURCE test
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment