Last active
January 24, 2024 04:56
-
-
Save o0-o/cac1fa05e25378bde55c085b0f1f80af to your computer and use it in GitHub Desktop.
Sync a mirror to a local directory using wget, optionally assisted by rsync.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env sh | |
# | |
# Sync a mirror to a local directory using wget, optionally assisted by | |
# rsync. | |
# | |
# If the source mirror supports rsync, or an alternative rsync mirror | |
# is supplied via $MIRROR_RSYNC_URL, wget is only used to download | |
# files (directories, hard and symbolic links, file list and deletions | |
# are generated with rsync). | |
# | |
# A lock file is used to avoid simultaneous execution. If a previous
# wget process is actively syncing the specified mirror and appears to | |
# be healthy, a warning is printed to stderr and no files are copied | |
# or deleted. | |
# | |
# If rsync is unavailable, wget will be used exclusively similar to | |
# apt-mirror. This is very slow and some mirrors may not like it, so | |
# use at your own discretion. It is occasionally the only option, for | |
# instance, in the case of Proxmox. Fortunately, the Proxmox mirror | |
# is relatively small. | |
# | |
# Either way, this provides a way to retrieve mirror data over HTTPS
# instead of the more common rsync, HTTP, FTP or torrent methods. Of | |
# course, methods like PGP exist to validate data in those situations, | |
# but why not add another layer of certainty with HTTPS? | |
# | |
# In the case of packages, package managers will verify checksums and | |
# signatures automatically, but too often, no or partial validation is | |
# performed on ISOs and cloud images. | |
# | |
# Usage is very simple. Only 2 positional arguments are taken. All other | |
# configuration may be passed via environmental variables and the | |
# positional arguments may also be supplied via environmental variables. | |
# | |
# Environmental variables take precedence over positional arguments if
# both are present. | |
# | |
# mirror.sh MIRROR_HTTP_URL MIRROR_SUBDIR | |
# | |
# MIRROR_RSYNC_MODULE=ftp /root/mirror.sh https://ftp.usa.openbsd.org/pub/OpenBSD/ openbsd | |
# | |
######################################################################## | |
# Shell options: stop on command failure and on use of an unset variable.
set -e
set -u
# POSIX mode and pipefail are requested, but a shell that rejects either
# option is tolerated.
set -o posix || true
set -o pipefail || true
# Any non-empty MIRROR_DEBUG switches on command tracing.
if [ -n "${MIRROR_DEBUG-}" ]; then
  set -x
fi
# Tool locations and base directories. Each keeps the value found in the
# environment and only falls back to the default when unset or empty.
: "${MIRROR_WGET_PATH:=/usr/local/bin/wget}"
: "${MIRROR_RSYNC_PATH:=/usr/local/bin/rsync}"
: "${MIRROR_TMP_DIR:=/tmp}"
: "${MIRROR_WWW_DIR:=/var/www/htdocs/mirror}"
# The positional arguments are consulted only when the matching
# environment variable is unset or empty.
: "${MIRROR_HTTP_URL:=$1}"
: "${MIRROR_SUBDIR:=$2}"
# MIRROR_LIMIT is handed to wget unchanged, so it must use the syntax of
# wget's --limit-rate flag. See man wget for specifics.
: "${MIRROR_LIMIT:=100m}"

# Normalized values: the URL always ends in exactly one added slash, the
# sub-directory never ends in one.
http_url="${MIRROR_HTTP_URL%/}/"
subdir="${MIRROR_SUBDIR%/}"
limit="$MIRROR_LIMIT"
timestamp="$(date +%s)"
download_dir="${MIRROR_WWW_DIR%/}/${subdir}"
lock_file="${download_dir}/.lock"
# Flatten the sub-directory into a single token for messages and for the
# temporary directory name.
name="$( echo "$subdir" | tr '/' '_' )"
# Private scratch directory for the logs; the rest of the script runs
# from inside it.
tmp_dir="$( mktemp -d -p "$MIRROR_TMP_DIR" "mirror_${name}.XXXXXXXXXX" )"
cd "$tmp_dir"
# Decide whether rsync can assist this sync.
#
# Reads:  MIRROR_RSYNC_URL, MIRROR_RSYNC_PATH, http_url, download_dir, name
# Writes: http_base, and MIRROR_RSYNC_URL when the HTTP host also answers
#         as an rsync daemon
#
# A MIRROR_RSYNC_URL supplied by the caller is used as-is and skips the
# probe entirely; previously that case fell through to the marker check
# and printed a false "rsync is unavailable" error.
#
# When no rsync mirror can be used but the destination carries the .rsync
# marker of an earlier rsync-assisted sync, the script now really exits
# (status 1), as the message has always announced.
detect_rsync_mirror() {
  [ -z "${MIRROR_RSYNC_URL-}" ] || return 0

  # Probe silently: a missing rsync binary is a supported configuration.
  if "$MIRROR_RSYNC_PATH" --version >/dev/null 2>&1; then
    # Host part of the HTTP URL (scheme and path removed).
    http_base="$( echo "${http_url#*//}" | sed -e 's@/.*@@' )"
    # See if MIRROR_HTTP_URL is also a valid rsync mirror and if it is,
    # use it.
    if "$MIRROR_RSYNC_PATH" -q --contimeout 10 rsync://"${http_base}" 2>/dev/null; then
      MIRROR_RSYNC_URL="${http_base}"
      return 0
    fi
  fi

  [ ! -e "${download_dir}/.rsync" ] || {
    printf 'Previous mirror sync of %s used rsync but rsync is unavailable, exiting...\n' "$name" 1>&2
    exit 1
  }
}
detect_rsync_mirror
# Make sure the destination directory exists, creating it together with
# any missing parents; give up if that is not possible.
if [ ! -d "$download_dir" ] && ! mkdir -p "$download_dir"; then
  printf 'Directory %s does not exist and could not be created\n' "$download_dir" 1>&2
  exit 1
fi
# Honor the lock left by an earlier run. The lock file holds the PID that
# the earlier run recorded for its wget process.
#   - PID present, state contains I, T or Z: considered unhealthy; the
#     process is killed, the lock removed, and this run carries on
#     (or exits 1 if the kill or the removal fails).
#   - PID present, any other state: considered healthy; this run exits 0.
#   - Lock unreadable or PID not found by ps: the earlier run is reported
#     as interrupted and this run carries on.
if [ -e "$lock_file" ]; then
  if old_pid="$(cat "$lock_file")" &&
     old_pid_stat="$(ps "$old_pid" -o stat= )"; then
    printf 'Previous mirror sync of %s is still running with PID %s and status %s\n' "$name" "$old_pid" "$old_pid_stat" 1>&2
    if echo "$old_pid_stat" | grep -q '[ITZ]'; then
      printf 'Previous mirror sync of %s is in an unhealthy state\n' "$name" 1>&2
      if kill -9 "$old_pid" &&
         printf 'Killed previous mirror sync of %s\n' "$name" 1>&2 &&
         rm "$lock_file"; then
        :
      else
        printf 'Failed to kill previous mirror sync of %s, exiting...\n' "$name" 1>&2
        exit 1
      fi
    else
      printf 'Previous mirror sync of %s is in a healthy state, exiting...\n' "$name" 1>&2
      exit 0
    fi
  else
    printf 'Previous mirror sync of %s was interrupted\n' "$name" 1>&2
  fi
fi
# Count current files for wget pruning (only needed when rsync is not
# assisting).
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  old_file_ct="$( find "$download_dir" -type f | wc -l )"
fi
# Number of '/' characters in the URL once the scheme and one trailing
# slash are removed; passed to wget as --cut-dirs.
url_path="${http_url#*://}"
cut_dirs="$( echo "${url_path%/}" | tr -cd '/' | wc -c )"
# Both logs live in the scratch directory.
wget_log_file="${tmp_dir}/wget.log"
rsync_log_file="${tmp_dir}/rsync.log"
# wget-only mode: with no rsync mirror, wget crawls the whole HTTP tree
# itself. It runs in the background and its PID is kept in wget_pid.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  # $cut_dirs is expanded unquoted, presumably so that any padding in the
  # wc output it was computed from is dropped.
  "$MIRROR_WGET_PATH" \
    --mirror \
    --timestamping \
    --no-parent \
    --no-host-directories \
    --progress=bar:force:noscroll \
    --limit-rate "$limit" \
    --level inf \
    --cut-dirs $cut_dirs \
    --reject "index.*,*.gif,*.changelog" \
    --no-cache \
    --no-if-modified-since \
    --output-file "$wget_log_file" \
    --directory-prefix "$download_dir" \
    "$http_url" &
  wget_pid="$!"
fi
# rsync-assisted mode: rsync replicates the directory tree and produces
# the list of changes; wget then fetches only the listed files over HTTP.
if [ -n "${MIRROR_RSYNC_URL-}" ]; then
  # The rsync module name falls back to the mirror sub-directory.
  : "${MIRROR_RSYNC_MODULE:=$subdir}"

  # Pass 1: directories only ("+ */" keeps directories, "- *" drops
  # everything else). Its itemized output starts a fresh rsync log.
  if ! "$MIRROR_RSYNC_PATH" \
      --recursive \
      --times \
      --links \
      --omit-dir-times \
      --omit-link-times \
      --itemize-changes \
      --safe-links \
      --force \
      --no-motd \
      --filter "+ */" \
      --filter "- *" \
      --filter '- /project/trace/*' \
      "${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
      "$download_dir" \
      1> "$rsync_log_file"; then
    printf 'Failed to sync directory tree of mirror %s (rsync)\n' "$name" 1>&2
    exit 1
  fi

  # Pass 2: dry run of a full sync including deletions. Nothing is
  # transferred; the itemized change list is appended to the rsync log.
  if ! "$MIRROR_RSYNC_PATH" \
      --recursive \
      --times \
      --links \
      --hard-links \
      --omit-dir-times \
      --omit-link-times \
      --itemize-changes \
      --safe-links \
      --force \
      --no-motd \
      --delete \
      --filter '- /project/trace/*' \
      --dry-run \
      "${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
      "$download_dir" \
      1>> "$rsync_log_file"; then
    printf 'Failed to generate the file list of mirror %s (rsync)\n' "$name" 1>&2
    exit 1
  fi

  # Only use wget for files: the paths of the ">f" entries in the rsync
  # log are fed to wget on stdin and resolved against the HTTP URL. The
  # whole list runs in the background; its PID is kept in wget_pid.
  touch "$wget_log_file" &&
    sed -ne '/^>f/ { s/^[[:graph:]]*[[:space:]]*//; p; }' "$rsync_log_file" |
    grep -v 'project/trace' |
    "$MIRROR_WGET_PATH" \
      --mirror \
      --timestamping \
      --no-parent \
      --no-host-directories \
      --progress=bar:force:noscroll \
      --limit-rate "$limit" \
      --level inf \
      --cut-dirs $cut_dirs \
      --no-cache \
      --no-if-modified-since \
      --output-file "$wget_log_file" \
      --directory-prefix "$download_dir" \
      --base "$http_url" \
      --input-file - &
  wget_pid="$!"
fi
# Record the PID of the background wget in the lock file, then wait for
# it to finish and classify the outcome:
#   - status 0: success.
#   - status 8 (wget's code for an error response from the server): the
#     number of ERROR lines in the wget log is reported and the sync
#     continues.
#   - any other status, including a failure to write the lock file:
#     tolerated only when the wget log says 'No URLs found'; otherwise the
#     script exits 1.
# The failure message now reports wget's own exit status; it used to print
# the status of the grep that had just run.
wget_return=0
{ echo "$wget_pid" 1> "$lock_file" && wait "$wget_pid"; } || wget_return="$?"
if [ "$wget_return" -eq 8 ]; then
  # The count is expanded unquoted so that padding in wc's output is dropped.
  printf 'Encountered %s server errors during mirror sync of %s (wget), continuing...\n' \
    $( ( grep -F 'ERROR' "$wget_log_file" || true ) | wc -l ) \
    "$name" 1>&2
elif [ "$wget_return" -ne 0 ] && ! grep -qF 'No URLs found' "$wget_log_file"; then
  printf 'Mirror sync of %s exited with code %s (wget)\n' "$name" "$wget_return" 1>&2
  exit 1
fi
# Copy links (rsync-assisted mode only). Entries of the rsync log whose
# itemized flags start with h, c or '.' followed by L or f are reduced to
# their path, any " -> target" / " => target" suffix is cut off, and the
# resulting list is handed to rsync via --files-from.
if [ -n "${MIRROR_RSYNC_URL-}" ]; then
  if ! sed -ne '/^[hc.][Lf]/ { s/^[[:graph:]]*[[:space:]]*//; p; }' "$rsync_log_file" |
      sed -e 's/[[:space:]][=-]>[[:space:]].*$//' |
      "$MIRROR_RSYNC_PATH" \
        --times \
        --links \
        --hard-links \
        --omit-link-times \
        --safe-links \
        --force \
        --no-motd \
        --files-from - \
        "${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
        "${download_dir}" \
        1>> "$rsync_log_file"; then
    printf 'Failed to copy links for mirror %s (rsync)\n' "$name" 1>&2
    exit 1
  fi
fi
# Print the number of wget log lines matching pattern $1. A log without
# any match (or an unreadable log) counts as zero.
count_wget_log() {
  ( grep "$1" "$wget_log_file" || true ) | wc -l
}

# Counters derived from the wget log.
reject_file_ct="$( count_wget_log 'should be rejected' )"
saved_file_ct="$( count_wget_log 'Saving to' )"
existing_file_ct="$( count_wget_log 'not retrieving' )"
# Files saved and then rejected do not count as new.
new_file_ct="$(( $saved_file_ct - $reject_file_ct ))"
file_ct="$(( $new_file_ct + $existing_file_ct ))"
if [ "$new_file_ct" -ne 0 ]; then
  printf '%s new files added to mirror %s (wget)\n' "$new_file_ct" "$name" 1>&2
fi

# The prune counter starts below zero so that the script's own bookkeeping
# files are not reported as pruned: one for the lock file, one more when
# the .rsync marker exists, one more when .last_sync exists.
rm_ct=-1
if [ -e "${download_dir}/.rsync" ]; then
  rm_ct=-2
fi
if [ -e "${download_dir}/.last_sync" ]; then
  rm_ct="$(( $rm_ct - 1 ))"
fi
# Pruning with only wget is complicated: there is no deletion list, so
# every local file whose path is not mentioned anywhere in the wget log is
# treated as gone upstream and removed.
#
# Fixes over the previous version:
#   - paths are looked up in the log as fixed strings (grep -F); used as a
#     regular expression, characters such as '.' or '[' in a file name
#     could make the lookup match the wrong line or none at all;
#   - IFS is cleared for read so that leading or trailing blanks in file
#     names survive;
#   - '--' guards rm and grep against paths starting with a dash;
#   - the stray space after the newline of the summary message is gone.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  file_ct_diff="$(( $old_file_ct - $file_ct ))"
  [ "$file_ct_diff" -lt 1 ] ||
    printf 'Mirror %s will have %s fewer total files after pruning (wget)\n' "$name" "$file_ct_diff" 1>&2
  # Don't prune if the source contained 50% or fewer files compared to the
  # original destination
  [ "$file_ct_diff" -lt "$(( $old_file_ct / 2 ))" ] || {
    printf 'File count of %s mirror sync is suspiciously low (wget), skipping prune\n' "$name" 1>&2
    exit 1
  }
  # rm -v prints one line per removed file; those lines are the prune count.
  # Empty directories left behind are removed afterwards.
  rm_ct="$(( $rm_ct + $( find "$download_dir" -type f |
      while IFS= read -r f; do
        grep -qF -- "$f" "$wget_log_file" ||
          rm -v -- "$f"
      done |
      wc -l ) ))" &&
    find "$download_dir" -type d -empty -delete &&
    { [ "$rm_ct" -eq 0 ] ||
        printf '%s files were pruned from mirror %s (wget)\n' "$rm_ct" "$name" 1>&2
    } ||
    { printf 'Mirror prune of %s failed (wget)\n' "$name" 1>&2
      exit 1
    }
fi
# Remove every path that the rsync dry run reported as "deleting".
#   $1 - mirror root that the reported paths are relative to
#   $2 - rsync log holding the itemized dry-run output
# Prints rm's verbose output on stdout.
#
# Paths are read line by line instead of being passed through xargs, which
# splits on blanks and interprets quotes and would therefore mangle names
# containing them. The mirror root is prepended by the shell rather than
# spliced into the sed script, so it may contain any character.
prune_from_rsync_log() {
  sed -nEe '/^\*?deleting[[:space:]]/ { s@^\*?deleting[[:space:]]*@@; p; }' "$2" |
    while IFS= read -r rel_path; do
      # Never act on an empty entry: that would address the mirror root.
      [ -n "$rel_path" ] || continue
      # Skip entries that are already gone, e.g. removed with a parent.
      [ -e "${1}/${rel_path}" ] || [ -L "${1}/${rel_path}" ] || continue
      # Best effort: rm reports its own errors, remaining entries still run.
      rm -rv -- "${1}/${rel_path}" || true
    done
}

# Pruning with rsync is easy: the dry run already listed what to delete.
if [ -n "${MIRROR_RSYNC_URL-}" ]; then
  rm_ct="$(( $rm_ct + $( prune_from_rsync_log "$download_dir" "$rsync_log_file" | wc -l ) ))"
  # rsync log does not know about the lock file
  if rm "$lock_file" 2>/dev/null; then
    rm_ct="$(( $rm_ct + 1 ))"
  fi
  [ "$rm_ct" -eq 0 ] ||
    printf '%s files were pruned from mirror %s (rsync)\n' "$rm_ct" "$name" 1>&2
  # Leave the marker that records that this mirror is synced with rsync.
  if ! touch "${download_dir}/.rsync"; then
    printf 'Mirror prune of %s failed (rsync)\n' "$name" 1>&2
    exit 1
  fi
fi
# Wrap up: store the output of date in .last_sync inside the mirror,
# discard the scratch directory and report success.
date > "${download_dir}/.last_sync"
rm -r "$tmp_dir"
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment