Created
November 23, 2018 09:07
-
-
Save alphapapa/e4fc034bccd06c0cc2fb442ef2f039dc to your computer and use it in GitHub Desktop.
Archive web pages with wget, optionally compressing with tar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# * Defaults | |
compression=xz | |
subdir="web" | |
# * Functions | |
function debug { | |
if [[ $debug ]] | |
then | |
function debug { | |
echo "DEBUG: $@" >&2 | |
} | |
debug "$@" | |
else | |
function debug { | |
true | |
} | |
fi | |
} | |
function error { | |
echo "ERROR: $@" >&2 | |
((errors++)) # Initializes automatically | |
} | |
function die { | |
error "$@" | |
exit $errors | |
} | |
function usage { | |
cat <<EOF | |
$0 [OPTIONS] URL ... | |
Download web pages at URLs with images, stylesheets, etc. to the | |
current directory, optionally compressing into an archive. | |
Page resources are stored in a subdirectory, and HTML files are | |
symlinked to the current directory (which may cause page resources to | |
not display, so it may be necessary to open the symlink targets | |
directly). | |
Options | |
-d, --debug Print debug info | |
-h, --help I need somebody! | |
-a, --archive Compress downloaded files into tar archive | |
--archive-name NAME Set archive name (not including extension) | |
--compress-with EXT tar-supported compression method (xz by default) | |
-f, --flat Download files into one directory instead of domain-based hierarchy. | |
When not set, link HTML files into top-level subdir. | |
-s, --subdir NAME Set subdirectory name (web by default) | |
EOF | |
} | |
# * Args | |
args=$(getopt -n "$0" -o adfh -l archive,archive-name:,compress-with:,debug,flat,help,subdir: -- "$@") || exit 1 | |
eval set -- "$args" | |
while true | |
do | |
case "$1" in | |
-d|--debug) | |
debug=true | |
;; | |
-h|--help) | |
usage | |
exit | |
;; | |
-a|--archive) | |
archive=true | |
;; | |
--archive-name|--subdir) | |
# NOTE: We use the subdir as the archive name. | |
shift | |
subdir="$1" | |
;; | |
--compress-with) | |
shift | |
compression="$1" | |
;; | |
-f|--flat) | |
flat=true | |
;; | |
--) | |
# Remaining args (required; do not remove) | |
shift | |
rest=("$@") | |
break | |
;; | |
esac | |
shift | |
done | |
debug "ARGS: $args" | |
debug "Remaining args: ${rest[@]}" | |
# ** Prepare wget options | |
wget_options=( | |
# Don't download JavaScript | |
--ignore-tags=script | |
# Don't download fonts | |
# TODO: Make optional; add other extensions | |
--reject=ttf | |
# Skip robots.txt (not only to avoid having things fail to download, but because wget saves the robots.txt files!) | |
--execute robots=off | |
# Give downloaded HTML files .html extension | |
--adjust-extension | |
# Span hosts (e.g. for forum pages that include images from image hosts) | |
# TODO: Make optional | |
--span-hosts | |
# Convert links to work locally | |
--convert-links | |
# Backup original version of converted files | |
# TODO: Make optional | |
#--backup-converted | |
# Get page resources (requisites). This is the main option that retrieves page elements/resources. | |
--page-requisites | |
# Don't re-download page requisites that already exist locally and | |
# are up-to-date with the server (if the server returns | |
# Last-Modified). | |
--timestamping | |
# Save resources into "web" directory. | |
--directory-prefix="$subdir" | |
) | |
[[ $flat ]] && wget_options+=(--no-directories) | |
# * Main | |
# Record whether subdir already exists. | |
[[ -e "$subdir" ]] && subdir_existed=true | |
# ** Download page and resources | |
# Ignore non-zero exit codes from wget, which seem to be meaningless in recursive mode. | |
wget "${wget_options[@]}" "${rest[@]}" || error "wget exited with: $?" | |
# ** Link created HTML files | |
if ! [[ $flat ]] | |
then | |
find "$subdir" -type f -iname "*.html" | while read file | |
do | |
# TODO: Allow setting filename (which means only one file can be linked, but usually that's what we want). | |
filename=$(basename "$file") | |
ln -srv "$file" "${subdir}/${filename}" | |
done | |
fi | |
# ** Compress to archive file | |
if [[ $archive ]] | |
then | |
archive_name="$(pwd)/${subdir}.tar.${compression}" | |
# Don't overwrite existing file | |
[[ -e $archive_name ]] && die "File exists: $archive_name" | |
# Make archive from inside subdir to avoid leading path name. | |
cd "$subdir" || die "Couldn't change to $subdir" | |
tar --create --auto-compress --file "$archive_name" ./ || error "Unable to make archive" | |
cd .. | |
# Remove subdir if it didn't exist before running this script. | |
if [[ $subdir_existed ]] | |
then | |
echo "NOT deleting existing subdir: $subdir" >&2 | |
else | |
rm -rf "$subdir" | |
fi | |
fi | |
exit $errors |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment