Skip to content

Instantly share code, notes, and snippets.

@svandragt
Last active December 7, 2025 09:42
Show Gist options
  • Select an option

  • Save svandragt/71d1040d1d76a143e1ecf6a0d302798e to your computer and use it in GitHub Desktop.

Select an option

Save svandragt/71d1040d1d76a143e1ecf6a0d302798e to your computer and use it in GitHub Desktop.
archive.sh — Heuristic Zstandard tar archiver
#!/usr/bin/env bash
# archive.sh — Heuristic Zstandard tar archiver with optional PAR2 redundancy
# - Decides compression strategy based on directory size
# - Appends trailing slash if needed
# - Produces a uniquely named .tar.zst archive
# - Adds PAR2 recovery files for the archive whenever the par2 tool is installed
#
# Usage: ./archive.sh <directory>
#
# Env overrides:
# ZSTD_LEVEL Compression level (default: 19)
# SMALL_GB Size threshold for "small" dataset (default: 1 GiB)
# MEDIUM_GB Size threshold for "medium" dataset (default: 4 GiB)
# PAR_REDUNDANCY % redundancy for PAR2 (default: 10)
#
# Heuristic:
# < 1 GiB -> threads=1, no --long
# 1–<4 GiB -> threads≈half cores, no --long
# >= 4 GiB -> threads=all cores, --long=30
# Strict mode: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# -------- Config (overridable via env) ---------------------------------------
# Each setting keeps the caller's value when it is set and non-empty, and
# falls back to the default shown here otherwise.
: "${ZSTD_LEVEL:=19}"
: "${SMALL_GB:=1}"
: "${MEDIUM_GB:=4}"
: "${PAR_REDUNDANCY:=10}"
# -------- Helpers ------------------------------------------------------------
# Print the one-line synopsis to stderr and terminate the script.
# Outputs: "Usage: <script> <directory>" on stderr
# Exits:   always, with status 1
usage() {
  printf 'Usage: %s <directory>\n' "$0" >&2
  exit 1
}
# Ensure an external command is available on PATH.
# Arguments: $1 - command name to look up
# Outputs:   "Missing dependency: <name>" on stderr when the lookup fails
# Returns:   0 when the command exists; otherwise exits the script with 127
need_cmd() {
  local tool="$1"
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  printf 'Missing dependency: %s\n' "$tool" >&2
  exit 127
}
# Convert a whole number of GiB into bytes.
# Arguments: $1 - non-negative decimal integer (leading zeros are accepted)
# Outputs:   the byte count on stdout
# Returns:   0 on success; 2, with a message on stderr, when $1 is not a
#            non-negative decimal integer
bytes_from_gib() {
  local gib="$1"
  # The value comes from the environment (SMALL_GB / MEDIUM_GB). Reject
  # anything that is not plain digits so it is never evaluated as an
  # arithmetic expression.
  if [[ ! "$gib" =~ ^[0-9]+$ ]]; then
    echo "Invalid GiB value (expected a non-negative integer): $gib" >&2
    return 2
  fi
  # 10# forces base 10; without it "010" would be read as octal 8 and "08"
  # would be an arithmetic error.
  echo $(( 10#$gib * 1024 * 1024 * 1024 ))
}
# -------- Preconditions ------------------------------------------------------
# Every external tool listed here must be on PATH; need_cmd exits with 127 at
# the first one that is missing.
for required_tool in tar zstd du nproc awk date readlink; do
  need_cmd "$required_tool"
done
# Exactly one positional argument is accepted: the directory to archive.
if (( $# != 1 )); then
  usage
fi
INPUT_DIR="$1"
if [[ ! -d "$INPUT_DIR" ]]; then
  echo "Not a directory: $INPUT_DIR" >&2
  exit 2
fi
# Normalize directory with trailing slash: drop one trailing "/" if present,
# then append one, so the result always ends in "/".
DIR_WITH_SLASH="${INPUT_DIR%/}/"
# Resolve to a canonical absolute path, then split it into the parent
# directory (used for tar -C) and the final component (the archived member).
ABS_DIR="$(readlink -f "$DIR_WITH_SLASH")"
PARENT_DIR="$(dirname "$ABS_DIR")"
BASE_NAME="$(basename "$ABS_DIR")"
CHILD_NAME="$BASE_NAME"
# A second-resolution timestamp makes the archive name unique per run.
STAMP="$(date '+%Y%m%d-%H%M%S')"
ARCHIVE_NAME="${BASE_NAME}-${STAMP}.tar.zst"
# Compute size in bytes: du prints "<bytes><whitespace><path>", so keep only
# the text before the first whitespace character.
DIR_SIZE_BYTES="$(du -sb -- "$ABS_DIR")"
DIR_SIZE_BYTES="${DIR_SIZE_BYTES%%[[:space:]]*}"
# Convert the GiB thresholds to bytes for comparison against the size above.
MEDIUM_BYTES="$(bytes_from_gib "$MEDIUM_GB")"
SMALL_BYTES="$(bytes_from_gib "$SMALL_GB")"
# -------- Heuristic: threads & long window -----------------------------------
CORES="$(nproc)"
# Defaults describe the "small" tier: one thread, no long-distance window.
THREADS=1
USE_LONG=0
USE_PAR=1
if (( DIR_SIZE_BYTES >= SMALL_BYTES && DIR_SIZE_BYTES >= MEDIUM_BYTES )); then
  # Large tier: every core plus the long window.
  THREADS="$CORES"
  USE_LONG=1
elif (( DIR_SIZE_BYTES >= SMALL_BYTES )); then
  # Medium tier: half the cores, but at least 2 threads when more than one
  # core exists (and 1 thread on a single-core machine).
  HALF=$(( CORES / 2 ))
  THREADS=$(( HALF >= 2 ? HALF : (CORES > 1 ? 2 : 1) ))
fi
# Assemble the zstd flags; --long=30 comes first when it is enabled.
if (( USE_LONG == 1 )); then
  ZSTD_ARGS=( "--long=30" "-T${THREADS}" "-${ZSTD_LEVEL}" )
else
  ZSTD_ARGS=( "-T${THREADS}" "-${ZSTD_LEVEL}" )
fi
# -------- Summary ------------------------------------------------------------
# Report the settings chosen above before archiving starts.
if [[ $USE_LONG -eq 1 ]]; then LONG_LABEL=yes; else LONG_LABEL=no; fi
if [[ $USE_PAR -eq 1 ]]; then PAR_LABEL=yes; else PAR_LABEL=no; fi
printf '%s\n' \
  "Archiving with Zstandard:" \
  " Source dir: $ABS_DIR" \
  " Size (bytes): $DIR_SIZE_BYTES" \
  " Cores detected: $CORES" \
  " Threads: $THREADS" \
  " Long window: $LONG_LABEL" \
  " PAR2 recovery: $PAR_LABEL" \
  " Zstd level: -${ZSTD_LEVEL}" \
  " Output file: $ARCHIVE_NAME"
# -------- Archive creation ---------------------------------------------------
# tar splits the -I program string into words, so the zstd flags are passed as
# a single space-joined string. The archive is written to the current
# directory; -C makes the member paths relative to the source's parent.
tar -C "$PARENT_DIR" -I "zstd ${ZSTD_ARGS[*]}" -cf "$ARCHIVE_NAME" "${CHILD_NAME}/"
echo "Verifying archive table of contents..."
# An archive written with --long=30 uses a window larger than zstd's default
# decoder limit, so the decompressor must be given --long=30 as well.
# Without it the listing fails and, under `set -e`, aborts the script before
# the PAR2 step. The same flag is needed later to extract such an archive.
VERIFY_PROGRAM="zstd"
if (( USE_LONG == 1 )); then
  VERIFY_PROGRAM="zstd --long=30"
fi
tar -I "$VERIFY_PROGRAM" -tf "$ARCHIVE_NAME" >/dev/null
# -------- Optional PAR2 generation -------------------------------------------
# Recovery files are produced only when USE_PAR is 1 and the par2 binary is on
# PATH; a missing binary is reported but is not treated as an error.
if (( USE_PAR == 1 )) && command -v par2 >/dev/null 2>&1; then
  printf 'Creating PAR2 recovery files (%s%% redundancy)...\n' "$PAR_REDUNDANCY"
  par2 create -q -r"${PAR_REDUNDANCY}" "$ARCHIVE_NAME"
  printf '%s\n' "PAR2 recovery files created."
elif (( USE_PAR == 1 )); then
  printf '%s\n' "PAR2 not installed; skipping recovery file generation."
fi
printf 'Done: %s\n' "$ARCHIVE_NAME"
@svandragt
Copy link
Author

svandragt commented Dec 7, 2025

handy for stashing web development projects!

Update: added par2 -- adding a small (5–10 %) PAR set is worthwhile if the archive is long-term or remote. It’s cheap insurance against silent data corruption.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment