Skip to content

Instantly share code, notes, and snippets.

@o0-o
Last active January 24, 2024 04:56
Show Gist options
  • Save o0-o/cac1fa05e25378bde55c085b0f1f80af to your computer and use it in GitHub Desktop.
Save o0-o/cac1fa05e25378bde55c085b0f1f80af to your computer and use it in GitHub Desktop.
Sync a mirror to a local directory using wget, optionally assisted by rsync.
#!/usr/bin/env sh
#
# Sync a mirror to a local directory using wget, optionally assisted by
# rsync.
#
# If the source mirror supports rsync, or an alternative rsync mirror
# is supplied via $MIRROR_RSYNC_URL, wget is only used to download
# files (directories, hard and symbolic links, file list and deletions
# are generated with rsync).
#
# A lock file is used to avoid simultaneous execution. If a previous
# wget process is actively syncing the specified mirror and appears to
# be healthy, a warning is printed to stderr and no files are copied
# or deleted.
#
# If rsync is unavailable, wget will be used exclusively similar to
# apt-mirror. This is very slow and some mirrors may not like it, so
# use at your own discretion. It is occasionally the only option, for
# instance, in the case of Proxmox. Fortunately, the Proxmox mirror
# is relatively small.
#
# Either way, this provides a way to retrieve mirror data over HTTPS
# instead of the more common rsync, HTTP, FTP or torrent methods. Of
# course, methods like PGP exist to validate data in those situations,
# but why not add another layer of certainty with HTTPS?
#
# In the case of packages, package managers will verify checksums and
# signatures automatically, but too often, no or partial validation is
# performed on ISOs and cloud images.
#
# Usage is very simple. Only 2 positional arguments are taken. All other
# configuration may be passed via environmental variables and the
# positional arguments may also be supplied via environmental variables.
#
# Environmental variables take precedence over positional arguments if
# both are present.
#
# mirror.sh MIRROR_HTTP_URL MIRROR_SUBDIR
#
# MIRROR_RSYNC_MODULE=ftp /root/mirror.sh https://ftp.usa.openbsd.org/pub/OpenBSD/ openbsd
#
########################################################################
# Abort on any unhandled command failure and on use of an unset variable.
set -eu
# `set` is a special built-in: in a strict POSIX sh an unsupported option
# makes the non-interactive shell exit, and a trailing `|| true` cannot
# rescue it. Probe each optional mode in a subshell first and only enable
# it in this shell when the probe succeeded.
if ( set -o posix ) 2>/dev/null; then
  set -o posix
fi
if ( set -o pipefail ) 2>/dev/null; then
  set -o pipefail
fi
# Any non-empty MIRROR_DEBUG turns on command tracing.
[ -z "${MIRROR_DEBUG-}" ] || set -x

# Tool locations and base directories; each can be overridden from the
# environment.
MIRROR_WGET_PATH="${MIRROR_WGET_PATH:-/usr/local/bin/wget}"
MIRROR_RSYNC_PATH="${MIRROR_RSYNC_PATH:-/usr/local/bin/rsync}"
MIRROR_TMP_DIR="${MIRROR_TMP_DIR:-/tmp}"
MIRROR_WWW_DIR="${MIRROR_WWW_DIR:-/var/www/htdocs/mirror}"
# The environment wins over the positional arguments. `${1-}`/`${2-}` keep
# `set -u` from aborting with a bare "parameter not set" when an argument
# is missing; the explicit check below reports the problem instead.
MIRROR_HTTP_URL="${MIRROR_HTTP_URL:-${1-}}"
MIRROR_SUBDIR="${MIRROR_SUBDIR:-${2-}}"
# MIRROR_LIMIT is passed directly to wget, so the same formatting should
# be used as the --limit-rate flag. See man wget for specifics.
MIRROR_LIMIT="${MIRROR_LIMIT:-100m}"

# Normalised forms: the URL always ends in exactly the slash added here,
# the subdirectory never ends in one.
http_url="${MIRROR_HTTP_URL%/}/"
subdir="${MIRROR_SUBDIR%/}"
# An empty subdirectory would make download_dir the mirror root itself, and
# the prune steps later in the script delete files found under
# download_dir, so refuse to run rather than guess.
if [ -z "$MIRROR_HTTP_URL" ] || [ -z "$subdir" ]; then
  printf 'usage: %s MIRROR_HTTP_URL MIRROR_SUBDIR\n' "${0##*/}" 1>&2
  exit 1
fi
limit="${MIRROR_LIMIT}"
# Start time of this run in seconds since the epoch.
timestamp="$(date +%s)"
download_dir="${MIRROR_WWW_DIR%/}/${subdir}"
# Flatten the subdirectory into a single path component (slashes become
# underscores); used in messages and in the scratch directory name.
name="$( printf '%s\n' "$subdir" | sed -e 's@/@_@g' )"
# Private scratch directory for this run.
tmp_dir="$( mktemp -p "${MIRROR_TMP_DIR}" -d "mirror_${name}.XXXXXXXXXX" )"
lock_file="${download_dir}/.lock"
cd "$tmp_dir"
# See if MIRROR_HTTP_URL is also a valid rsync mirror and if it is, use it.
# Nothing is probed when the caller already supplied MIRROR_RSYNC_URL.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  # Host part of the HTTP URL: drop the scheme, then everything from the
  # first remaining slash.
  http_base="$( printf '%s\n' "${http_url#*//}" | sed -e 's@/.*@@' )"
  # rsync must be runnable locally (its stdout is discarded) and the host
  # must answer an rsync:// request within the connection timeout.
  if "$MIRROR_RSYNC_PATH" --version 2>&1 1>/dev/null &&
     "$MIRROR_RSYNC_PATH" -q --contimeout 10 rsync://"${http_base}" 2>/dev/null; then
    MIRROR_RSYNC_URL="${http_base}"
  elif [ -e "${download_dir}/.rsync" ]; then
    # The .rsync marker shows an earlier run of this mirror used rsync.
    # Stop here, as the message announces, instead of carrying on in
    # wget-only mode.
    printf 'Previous mirror sync of %s used rsync but rsync is unavailable, exiting...\n' "$name" 1>&2
    exit 1
  fi
fi
# Make sure the destination directory exists, creating it (and any missing
# parents) when necessary; give up if that is not possible.
if [ ! -d "$download_dir" ] && ! mkdir -p "$download_dir"; then
  printf 'Directory %s does not exist and could not be created\n' "$download_dir" 1>&2
  exit 1
fi
# Deal with a lock file left in the download directory by an earlier run.
# Outcomes:
#   - no lock file: nothing happens;
#   - the lock cannot be read, or ps reports nothing for the stored PID:
#     note that the previous run was interrupted and carry on;
#   - the process exists and its state matches [ITZ]: kill it, remove the
#     lock and carry on (exit 1 if that fails);
#   - the process exists in any other state: leave it alone and exit 0.
[ -e "$lock_file" ] &&
# Read the PID stored in the lock file and ask ps for that process's state.
{ old_pid="$(cat "$lock_file")" &&
old_pid_stat="$(ps "$old_pid" -o stat= )" &&
{ printf 'Previous mirror sync of %s is still running with PID %s and status %s\n' "$name" "$old_pid" "$old_pid_stat" 1>&2
# NOTE(review): I, T and Z are presumably the ps state letters for idle,
# stopped and zombie processes -- confirm against ps(1) on the target OS.
echo "$old_pid_stat" | grep -q '[ITZ]' &&
{ printf 'Previous mirror sync of %s is in an unhealthy state\n' "$name" 1>&2
# The failure branch below runs when any link of this chain fails: the
# kill, the confirmation message, or the removal of the lock file.
kill -9 "$old_pid" &&
printf 'Killed previous mirror sync of %s\n' "$name" 1>&2 &&
rm "$lock_file" ||
{ printf 'Failed to kill previous mirror sync of %s, exiting...\n' "$name" 1>&2
exit 1
}
} ||
# Reached when the state did not match [ITZ]: do not disturb the other run.
{ printf 'Previous mirror sync of %s is in a healthy state, exiting...\n' "$name" 1>&2
exit 0
}
# Reached when reading the lock file or the ps lookup failed.
} || printf 'Previous mirror sync of %s was interrupted\n' "$name" 1>&2
}
# Per-run log files live in the scratch directory.
wget_log_file="${tmp_dir}/wget.log"
rsync_log_file="${tmp_dir}/rsync.log"
# In wget-only mode, remember how many regular files the mirror holds
# before syncing; the wget prune step compares against this number.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  old_file_ct="$( find "$download_dir" -type f | wc -l )"
fi
# Number of slashes left in the URL once the scheme and one trailing slash
# are removed; handed to wget as --cut-dirs.
url_path="${http_url#*://}"
url_path="${url_path%/}"
cut_dirs="$( printf '%s\n' "$url_path" | tr -c -d '/' | wc -c )"
# wget-only mode (no rsync URL available): start a full recursive mirror in
# the background and remember its PID for the lock file and the later wait.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  # $cut_dirs is left unquoted on purpose so that any padding around the
  # number is dropped by word splitting.
  "$MIRROR_WGET_PATH" \
    --mirror \
    --timestamping \
    --no-parent \
    --no-host-directories \
    --progress=bar:force:noscroll \
    --limit-rate "$limit" \
    --level inf \
    --cut-dirs $cut_dirs \
    --reject "index.*,*.gif,*.changelog" \
    --no-cache \
    --no-if-modified-since \
    --output-file "$wget_log_file" \
    --directory-prefix "$download_dir" \
    "$http_url" &
  wget_pid=$!
fi
# rsync-assisted mode: runs only when MIRROR_RSYNC_URL is non-empty.
# rsync builds the directory tree and produces the list of changes; wget
# then downloads the listed files over HTTP(S) in the background.
[ -z "${MIRROR_RSYNC_URL-}" ] ||
# The rsync module name defaults to the mirror subdirectory.
{ MIRROR_RSYNC_MODULE="${MIRROR_RSYNC_MODULE:-$subdir}"
# Pass 1 (real transfer): the filter pair "+ */" / "- *" lets directories
# through and excludes everything else, so only the directory tree is
# created. The itemized output starts a fresh rsync log.
"$MIRROR_RSYNC_PATH" --recursive \
--times \
--links \
--omit-dir-times \
--omit-link-times \
--itemize-changes \
--safe-links \
--force \
--no-motd \
--filter "+ */" \
--filter "- *" \
--filter '- /project/trace/*' \
"${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
"$download_dir" 1>\
"$rsync_log_file" ||
{ printf 'Failed to sync directory tree of mirror %s (rsync)\n' "$name" 1>&2
exit 1
}
# Pass 2 (--dry-run): nothing is transferred or deleted. The itemized
# list of files, links and pending deletions is appended to the rsync log,
# which the download, link-copy and prune steps read afterwards.
"$MIRROR_RSYNC_PATH" --recursive \
--times \
--links \
--hard-links \
--omit-dir-times \
--omit-link-times \
--itemize-changes \
--safe-links \
--force \
--no-motd \
--delete \
--filter '- /project/trace/*' \
--dry-run \
"${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
"$download_dir" 1>>\
"$rsync_log_file" ||
{ printf 'Failed to generate the file list of mirror %s (rsync)\n' "$name" 1>&2
exit 1
}
# Only use wget for files: take the names from the ">f" lines of the rsync
# log (the leading itemize field and the blanks after it are stripped),
# drop anything under project/trace, and feed the remaining relative paths
# to wget on stdin, resolved against $http_url via --base.
# $cut_dirs is expanded unquoted, so it is subject to word splitting.
# NOTE(review): the trailing "&" applies to the whole "touch && pipeline"
# list, so "$!" may be the PID of a background subshell rather than of
# wget itself -- confirm, since that PID is what ends up in the lock file.
touch "$wget_log_file" &&
sed -ne '/^>f/ { s/^[[:graph:]]*[[:space:]]*//; p; }' "$rsync_log_file" |
grep -v 'project/trace' |
"$MIRROR_WGET_PATH" --mirror \
--timestamping \
--no-parent \
--no-host-directories \
--progress=bar:force:noscroll \
--limit-rate "$limit" \
--level inf \
--cut-dirs $cut_dirs \
--no-cache \
--no-if-modified-since \
--output-file "$wget_log_file" \
--directory-prefix "$download_dir" \
--base "$http_url" \
--input-file - &
wget_pid="$!"
}
# Publish the PID of the background download in the lock file, then block
# until the download finishes. wget_return ends up as 0 on success, or as
# the status of whichever step failed (writing the lock file or the wait).
wget_return=0
{ echo "$wget_pid" 1> "$lock_file" &&
  wait "$wget_pid"
} || wget_return="$?"

if [ "$wget_return" -eq 8 ]; then
  # wget exits with 8 when the server answered some request with an error;
  # the files that did download are still usable, so only report how many
  # ERROR lines the log contains and keep going.
  error_ct="$(( $( { grep -F 'ERROR' "$wget_log_file" || true; } | wc -l ) ))"
  printf 'Encountered %s server errors during mirror sync of %s (wget), continuing...\n' \
    "$error_ct" \
    "$name" 1>&2
elif [ "$wget_return" -ne 0 ] &&
     ! grep -qF 'No URLs found' "$wget_log_file"; then
  # Any other failure is fatal unless the log says wget simply had no URLs
  # to fetch. Report the saved status: by this point "$?" only holds the
  # result of the grep above, not the exit code of the download.
  printf 'Mirror sync of %s exited with code %s (wget)\n' "$name" "$wget_return" 1>&2
  exit 1
fi
# Copy links (rsync-assisted mode only). Entries are taken from the rsync
# log lines whose itemize field starts with h, c or "." followed by L or
# f. The itemize field and the blanks after it are stripped, then
# everything from " -> " or " => " onwards, and the remaining paths are
# handed to rsync through --files-from on stdin. rsync's own output is
# appended to the same log.
if [ -n "${MIRROR_RSYNC_URL-}" ]; then
  if ! sed -n -e '/^[hc.][Lf]/ { s/^[[:graph:]]*[[:space:]]*//; s/[[:space:]][=-]>[[:space:]].*$//; p; }' "$rsync_log_file" |
       "$MIRROR_RSYNC_PATH" \
         --times \
         --links \
         --hard-links \
         --omit-link-times \
         --safe-links \
         --force \
         --no-motd \
         --files-from - \
         "${MIRROR_RSYNC_URL}::${MIRROR_RSYNC_MODULE}" \
         "${download_dir}" >> "$rsync_log_file"; then
    printf 'Failed to copy links for mirror %s (rsync)\n' "$name" 1>&2
    exit 1
  fi
fi
# Print how many lines of the wget log contain the given text (0 when
# there are none).
count_wget_log_lines() {
  { grep -e "$1" "$wget_log_file" || true; } | wc -l
}

# Tally what wget reported: files saved and then rejected, files saved and
# kept, and files skipped because they did not need retrieving.
reject_file_ct="$( count_wget_log_lines 'should be rejected' )"
saved_file_ct="$( count_wget_log_lines 'Saving to' )"
existing_file_ct="$( count_wget_log_lines 'not retrieving' )"
new_file_ct="$(( $saved_file_ct - $reject_file_ct ))"
file_ct="$(( $new_file_ct + $existing_file_ct ))"
if [ "$new_file_ct" -ne 0 ]; then
  printf '%s new files added to mirror %s (wget)\n' "$new_file_ct" "$name" 1>&2
fi

# The prune counter starts below zero so that the script's own bookkeeping
# files are not reported as pruned mirror content: one for the lock file,
# one for the .rsync marker if present, one for .last_sync if present.
rm_ct=-1
if [ -e "${download_dir}/.rsync" ]; then
  rm_ct=-2
fi
if [ -e "${download_dir}/.last_sync" ]; then
  rm_ct="$(( $rm_ct - 1 ))"
fi
# Pruning with only wget is complicated: there is no deletion list, so a
# local file is kept only if its path appears somewhere in the wget log.
if [ -z "${MIRROR_RSYNC_URL-}" ]; then
  file_ct_diff="$(( $old_file_ct - $file_ct ))"
  [ "$file_ct_diff" -lt 1 ] ||
    printf 'Mirror %s will have %s fewer total files after pruning (wget)\n' "$name" "$file_ct_diff" 1>&2
  # Don't prune if the source contained 50% or fewer files compared to the original destination
  if ! [ "$file_ct_diff" -lt "$(( $old_file_ct / 2 ))" ]; then
    printf 'File count of %s mirror sync is suspiciously low (wget), skipping prune\n' "$name" 1>&2
    exit 1
  fi
  # Walk every regular file and delete those the log never mentions;
  # `rm -v` prints one line per removed file, which is what gets counted.
  #  - `grep -F --` compares the path literally. Treated as a regular
  #    expression, a name such as "a[1].iso" would not match itself and the
  #    file would be deleted even though it was just synced.
  #  - `IFS= read -r` keeps leading/trailing blanks and backslashes in
  #    file names intact.
  if pruned_ct="$( find "$download_dir" -type f |
         while IFS= read -r f; do
           grep -qF -- "$f" "$wget_log_file" ||
             rm -v -- "$f"
         done |
         wc -l )" &&
     rm_ct="$(( $rm_ct + $pruned_ct ))" &&
     find "$download_dir" -type d -empty -delete; then
    # rm_ct started below zero to offset the bookkeeping files, so 0 means
    # no mirror content was removed.
    [ "$rm_ct" -eq 0 ] ||
      printf '%s files were pruned from mirror %s (wget)\n' "$rm_ct" "$name" 1>&2
  else
    printf 'Mirror prune of %s failed (wget)\n' "$name" 1>&2
    exit 1
  fi
fi
# Pruning with rsync is easy: the dry run already logged every path that
# would be deleted ("deleting <path>", optionally prefixed with "*").
if [ -n "${MIRROR_RSYNC_URL-}" ]; then
  prune_failed=''
  # Remove each listed path and collect what `rm -rv` reports, one line per
  # removed entry.
  #  - Paths are read line by line, so names containing blanks, quotes or
  #    backslashes reach rm unchanged, and an empty list runs rm zero times.
  #  - The download directory is prepended by the shell instead of being
  #    spliced into the sed expression, so characters such as "@", "&" or
  #    "\" in that path cannot corrupt the substitution.
  #  - A path that is already gone (removed together with its parent
  #    directory) is skipped rather than treated as an error.
  removed="$(
    sed -nEe '/^\*?deleting[[:space:]]/ { s/^\*?deleting[[:space:]]*//; p; }' "$rsync_log_file" |
      while IFS= read -r rel_path; do
        [ -n "$rel_path" ] || continue
        doomed="${download_dir}/${rel_path}"
        [ -e "$doomed" ] || [ -L "$doomed" ] || continue
        rm -rv -- "$doomed" || exit 1
      done
  )" || prune_failed=1
  if [ -z "$prune_failed" ]; then
    # Count the collected lines; `grep -c` prints 0 for empty input.
    pruned_ct="$( printf '%s' "$removed" | grep -c '' || true )"
    rm_ct="$(( $rm_ct + $pruned_ct ))"
    # rsync log does not know about the lock file
    if rm "$lock_file" 2>/dev/null; then
      rm_ct="$(( $rm_ct + 1 ))"
    fi
    [ "$rm_ct" -eq '0' ] ||
      printf '%s files were pruned from mirror %s (rsync)\n' "$rm_ct" "$name" 1>&2
    # Leave a marker recording that this mirror was synced with rsync.
    touch "${download_dir}/.rsync" || prune_failed=1
  fi
  if [ -n "$prune_failed" ]; then
    printf 'Mirror prune of %s failed (rsync)\n' "$name" 1>&2
    exit 1
  fi
fi
# Record when this sync completed, discard the scratch directory and
# report success.
date > "${download_dir}/.last_sync"
rm -r "$tmp_dir"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment