Last active
December 7, 2025 09:42
-
-
Save svandragt/71d1040d1d76a143e1ecf6a0d302798e to your computer and use it in GitHub Desktop.
archive.sh — Heuristic Zstandard tar archiver
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# archive.sh — Heuristic Zstandard tar archiver with optional PAR2 redundancy
#
# What it does:
#   - Measures the directory and picks a zstd strategy from its size
#   - Normalizes the directory argument to carry a trailing slash
#   - Writes a timestamped <name>-YYYYmmdd-HHMMSS.tar.zst in the current directory
#   - Creates PAR2 recovery files for the archive when `par2` is installed
#
# Usage: ./archive.sh <directory>
#
# Environment overrides:
#   ZSTD_LEVEL      Compression level (default: 19)
#   SMALL_GB        Upper bound, in GiB, of a "small" dataset (default: 1)
#   MEDIUM_GB       Upper bound, in GiB, of a "medium" dataset (default: 4)
#   PAR_REDUNDANCY  PAR2 redundancy in percent (default: 10)
#
# Heuristic (shown with the default thresholds):
#   size <  SMALL_GB  (1 GiB)  -> 1 thread, no --long
#   size <  MEDIUM_GB (4 GiB)  -> about half the cores, no --long
#   size >= MEDIUM_GB (4 GiB)  -> all cores, --long=30

# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail

# -------- Config (overridable via env) ---------------------------------------
# ':=' assigns the default only when the variable is unset or empty.
: "${ZSTD_LEVEL:=19}"
: "${SMALL_GB:=1}"
: "${MEDIUM_GB:=4}"
: "${PAR_REDUNDANCY:=10}"
| # -------- Helpers ------------------------------------------------------------ | |
#######################################
# Print the one-line usage synopsis and terminate the script.
# Outputs: the synopsis on stderr
# Returns: never returns; exits with status 1
#######################################
usage() {
  printf 'Usage: %s <directory>\n' "$0" >&2
  exit 1
}
#######################################
# Ensure an external command is available, aborting the script otherwise.
# Arguments: $1 - name of the command to look up
# Outputs:   "Missing dependency: <name>" on stderr when the lookup fails
# Returns:   0 when the command is found; exits with status 127 otherwise
#######################################
need_cmd() {
  local tool="$1"
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'Missing dependency: %s\n' "$tool" >&2
    exit 127
  fi
}
#######################################
# Convert a whole number of GiB into bytes.
# Arguments: $1 - non-negative decimal integer (GiB)
# Outputs:   the byte count on stdout; a diagnostic on stderr for invalid input
# Returns:   0 on success, 2 when $1 is not a non-negative integer
#######################################
bytes_from_gib() {
  local gib="${1-}"

  # Accept plain digits only. Bash arithmetic evaluates its operand as an
  # expression, so unvalidated text (fractions, names, subscripts) would either
  # fail with a cryptic syntax error or be interpreted rather than rejected.
  if [[ ! "$gib" =~ ^[0-9]+$ ]]; then
    echo "Invalid GiB value (expected a non-negative integer): ${gib}" >&2
    return 2
  fi

  # 10# forces base 10 so zero-padded input such as "08" is not read as octal.
  echo $(( 10#$gib * 1024 * 1024 * 1024 ))
}
# -------- Preconditions ------------------------------------------------------
# Fail early if any external tool used below is unavailable.
for tool in tar zstd du nproc awk date readlink; do
  need_cmd "$tool"
done

[[ $# -eq 1 ]] || usage

INPUT_DIR="$1"
[[ -d "$INPUT_DIR" ]] || { echo "Not a directory: $INPUT_DIR" >&2; exit 2; }

# Normalize directory with trailing slash (adds one only when it is missing).
DIR_WITH_SLASH="${INPUT_DIR%/}/"

# `--` stops option parsing so a directory whose name begins with '-' is
# treated as a path rather than as a flag.
ABS_DIR="$(readlink -f -- "$DIR_WITH_SLASH")"

# basename of "/" is "/", which would produce the unusable archive name
# "/-<stamp>.tar.zst"; reject the root directory explicitly instead.
if [[ "$ABS_DIR" == "/" ]]; then
  echo "Refusing to archive the filesystem root: $INPUT_DIR" >&2
  exit 2
fi

PARENT_DIR="$(dirname -- "$ABS_DIR")"
CHILD_NAME="$(basename -- "$ABS_DIR")"
BASE_NAME="$CHILD_NAME"
STAMP="$(date +'%Y%m%d-%H%M%S')"
ARCHIVE_NAME="${BASE_NAME}-${STAMP}.tar.zst"

# Compute size in bytes
DIR_SIZE_BYTES="$(du -sb -- "$ABS_DIR" | awk '{print $1}')"
SMALL_BYTES="$(bytes_from_gib "$SMALL_GB")"
MEDIUM_BYTES="$(bytes_from_gib "$MEDIUM_GB")"

# -------- Heuristic: threads & long window -----------------------------------
CORES="$(nproc)"
THREADS=1
USE_LONG=0
USE_PAR=1
# Window log passed to zstd --long; used for compression AND verification.
LONG_WINDOW_LOG=30

if (( DIR_SIZE_BYTES < SMALL_BYTES )); then
  # Small dataset: a single thread, default window.
  THREADS=1
elif (( DIR_SIZE_BYTES < MEDIUM_BYTES )); then
  # Medium dataset: about half the cores, but two threads whenever more than
  # one core exists.
  HALF=$(( CORES / 2 ))
  if (( HALF < 2 )); then
    THREADS=$(( CORES > 1 ? 2 : 1 ))
  else
    THREADS="$HALF"
  fi
else
  # Large dataset: all cores plus long-distance matching.
  THREADS="$CORES"
  USE_LONG=1
fi

ZSTD_ARGS=()
# Program string handed to `tar -I` when reading the archive back.
VERIFY_PROG="zstd"
if (( USE_LONG == 1 )); then
  ZSTD_ARGS+=( "--long=${LONG_WINDOW_LOG}" )
  # zstd refuses to decompress frames whose window exceeds its default limit
  # (windowLog 27) unless --long/--memory is given, so the verification step
  # must carry the same window size that was used for compression.
  VERIFY_PROG="zstd --long=${LONG_WINDOW_LOG}"
fi
ZSTD_ARGS+=( "-T${THREADS}" "-${ZSTD_LEVEL}" )

# -------- Summary ------------------------------------------------------------
LONG_LABEL="no"
if (( USE_LONG == 1 )); then
  LONG_LABEL="yes"
fi
PAR_LABEL="no"
if (( USE_PAR == 1 )); then
  PAR_LABEL="yes"
fi

echo "Archiving with Zstandard:"
echo " Source dir: $ABS_DIR"
echo " Size (bytes): $DIR_SIZE_BYTES"
echo " Cores detected: $CORES"
echo " Threads: $THREADS"
echo " Long window: $LONG_LABEL"
echo " PAR2 recovery: $PAR_LABEL"
echo " Zstd level: -${ZSTD_LEVEL}"
echo " Output file: $ARCHIVE_NAME"

# -------- Archive creation ---------------------------------------------------
# tar takes the compressor and its options as ONE string, hence the [*] join.
# Archiving relative to the parent keeps only "<name>/..." paths in the archive.
tar -C "$PARENT_DIR" -I "zstd ${ZSTD_ARGS[*]}" -cf "$ARCHIVE_NAME" -- "${CHILD_NAME}/"

echo "Verifying archive table of contents..."
tar -I "$VERIFY_PROG" -tf "$ARCHIVE_NAME" >/dev/null

# -------- Optional PAR2 generation -------------------------------------------
if (( USE_PAR == 1 )); then
  if command -v par2 >/dev/null 2>&1; then
    echo "Creating PAR2 recovery files (${PAR_REDUNDANCY}% redundancy)..."
    # `--` protects archive names that start with '-'.
    par2 create -q -r"${PAR_REDUNDANCY}" -- "$ARCHIVE_NAME"
    echo "PAR2 recovery files created."
  else
    echo "PAR2 not installed; skipping recovery file generation."
  fi
fi

echo "Done: ${ARCHIVE_NAME}"
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
handy for stashing web development projects!
Update: added PAR2 support. Adding a small (5–10%) PAR2 recovery set is worthwhile if the archive is kept long-term or stored remotely; it’s cheap insurance against silent data corruption.