alphapapa · November 23, 2018 09:07
diff --git a/wget-page.sh b/wget-page.sh
 #!/bin/bash

 # * Defaults

 compression=xz
 subdir="web"

 # * Functions

 function debug {
    if [[ $debug ]]
    then
        function debug {
            echo "DEBUG: $@" >&2
        }
        debug "$@"
    else
        function debug {
            true
        }
    fi
 }
 function error {
    echo "ERROR: $@" >&2
    ((errors++))  # Initializes automatically
 }
 function die {
    error "$@"
    exit $errors
 }
 function usage {
    cat <<EOF
 $0 [OPTIONS] URL ...

 Download web pages at URLs with images, stylesheets, etc. to the
 current directory, optionally compressing into an archive.

 Page resources are stored in a subdirectory, and HTML files are
 symlinked to the current directory (which may cause page resources to
 not display, so it may be necessary to open the symlink targets
 directly).

 Options
  -d, --debug  Print debug info
  -h, --help   I need somebody!

  -a, --archive        Compress downloaded files into tar archive
  --archive-name NAME  Set archive name (not including extension)
  --compress-with EXT  tar-supported compression method (xz by default)

  -f, --flat  Download files into one directory instead of domain-based hierarchy.
              When not set, link HTML files into top-level subdir.

  -s, --subdir NAME  Set subdirectory name (web by default)
 EOF
 }

 # * Args

 args=$(getopt -n "$0" -o adfh -l archive,archive-name:,compress-with:,debug,flat,help,subdir: -- "$@") || exit 1
 eval set -- "$args"

 while true
 do
    case "$1" in
        -d|--debug)
            debug=true
            ;;
        -h|--help)
            usage
            exit
            ;;
        -a|--archive)
            archive=true
            ;;
        --archive-name|--subdir)
            # NOTE: We use the subdir as the archive name.
            shift
            subdir="$1"
            ;;
        --compress-with)
            shift
            compression="$1"
            ;;
        -f|--flat)
            flat=true
            ;;
        --)
            # Remaining args (required; do not remove)
            shift
            rest=("$@")
            break
            ;;
    esac

    shift
 done

 debug "ARGS: $args"
 debug "Remaining args: ${rest[@]}"

 # ** Prepare wget options

 wget_options=(
    # Don't download JavaScript
    --ignore-tags=script

    # Don't download fonts
    # TODO: Make optional; add other extensions
    --reject=ttf

    # Skip robots.txt (not only to avoid having things fail to download, but because wget saves the robots.txt files!)
    --execute robots=off

    # Give downloaded HTML files .html extension
    --adjust-extension

    # Span hosts (e.g. for forum pages that include images from image hosts)
    # TODO: Make optional
    --span-hosts

    # Convert links to work locally
    --convert-links

    # Backup original version of converted files
    # TODO: Make optional
    #--backup-converted

    # Get page resources (requisites).  This is the main option that retrieves page elements/resources.
    --page-requisites

    # Don't re-download page requisites that already exist locally and
    # are up-to-date with the server (if the server returns
    # Last-Modified).
    --timestamping

    # Save resources into "web" directory.
    --directory-prefix="$subdir"
 )

 [[ $flat ]] && wget_options+=(--no-directories)

 # * Main

 # Record whether subdir already exists.
 [[ -e "$subdir" ]] && subdir_existed=true

 # ** Download page and resources

 # Ignore non-zero exit codes from wget, which seem to be meaningless in recursive mode.
 wget "${wget_options[@]}" "${rest[@]}" || error "wget exited with: $?"

 # ** Link created HTML files

 if ! [[ $flat ]]
 then
    find "$subdir" -type f -iname "*.html" | while read file
    do
        # TODO: Allow setting filename (which means only one file can be linked, but usually that's what we want).
        filename=$(basename "$file")
        ln -srv "$file" "${subdir}/${filename}"
    done
 fi

 # ** Compress to archive file

 if [[ $archive ]]
 then
    archive_name="$(pwd)/${subdir}.tar.${compression}"

    # Don't overwrite existing file
    [[ -e $archive_name ]] && die "File exists: $archive_name"

    # Make archive from inside subdir to avoid leading path name.
    cd "$subdir" || die "Couldn't change to $subdir"
    tar --create --auto-compress --file "$archive_name" ./ || error "Unable to make archive"
    cd ..

    # Remove subdir if it didn't exist before running this script.
    if [[ $subdir_existed ]]
    then
        echo "NOT deleting existing subdir: $subdir" >&2
    else
        rm -rf "$subdir"
    fi
 fi

 exit $errors
	#!/bin/bash

	# * Defaults

	compression=xz
	subdir="web"

	# * Functions

	function debug {
	if [[ $debug ]]
	then
	function debug {
	echo "DEBUG: $@" >&2
	}
	debug "$@"
	else
	function debug {
	true
	}
	fi
	}
	function error {
	echo "ERROR: $@" >&2
	((errors++)) # Initializes automatically
	}
	function die {
	error "$@"
	exit $errors
	}
	function usage {
	cat <<EOF
	$0 [OPTIONS] URL ...

	Download web pages at URLs with images, stylesheets, etc. to the
	current directory, optionally compressing into an archive.

	Page resources are stored in a subdirectory, and HTML files are
	symlinked to the current directory (which may cause page resources to
	not display, so it may be necessary to open the symlink targets
	directly).

	Options
	-d, --debug Print debug info
	-h, --help I need somebody!

	-a, --archive Compress downloaded files into tar archive
	--archive-name NAME Set archive name (not including extension)
	--compress-with EXT tar-supported compression method (xz by default)

	-f, --flat Download files into one directory instead of domain-based hierarchy.
	When not set, link HTML files into top-level subdir.

	-s, --subdir NAME Set subdirectory name (web by default)
	EOF
	}

	# * Args

	args=$(getopt -n "$0" -o adfh -l archive,archive-name:,compress-with:,debug,flat,help,subdir: -- "$@") \|\| exit 1
	eval set -- "$args"

	while true
	do
	case "$1" in
	-d\|--debug)
	debug=true
	;;
	-h\|--help)
	usage
	exit
	;;
	-a\|--archive)
	archive=true
	;;
	--archive-name\|--subdir)
	# NOTE: We use the subdir as the archive name.
	shift
	subdir="$1"
	;;
	--compress-with)
	shift
	compression="$1"
	;;
	-f\|--flat)
	flat=true
	;;
	--)
	# Remaining args (required; do not remove)
	shift
	rest=("$@")
	break
	;;
	esac

	shift
	done

	debug "ARGS: $args"
	debug "Remaining args: ${rest[@]}"

	# ** Prepare wget options

	wget_options=(
	# Don't download JavaScript
	--ignore-tags=script

	# Don't download fonts
	# TODO: Make optional; add other extensions
	--reject=ttf

	# Skip robots.txt (not only to avoid having things fail to download, but because wget saves the robots.txt files!)
	--execute robots=off

	# Give downloaded HTML files .html extension
	--adjust-extension

	# Span hosts (e.g. for forum pages that include images from image hosts)
	# TODO: Make optional
	--span-hosts

	# Convert links to work locally
	--convert-links

	# Backup original version of converted files
	# TODO: Make optional
	#--backup-converted

	# Get page resources (requisites). This is the main option that retrieves page elements/resources.
	--page-requisites

	# Don't re-download page requisites that already exist locally and
	# are up-to-date with the server (if the server returns
	# Last-Modified).
	--timestamping

	# Save resources into "web" directory.
	--directory-prefix="$subdir"
	)

	[[ $flat ]] && wget_options+=(--no-directories)

	# * Main

	# Record whether subdir already exists.
	[[ -e "$subdir" ]] && subdir_existed=true

	# ** Download page and resources

	# Ignore non-zero exit codes from wget, which seem to be meaningless in recursive mode.
	wget "${wget_options[@]}" "${rest[@]}" \|\| error "wget exited with: $?"

	# ** Link created HTML files

	if ! [[ $flat ]]
	then
	find "$subdir" -type f -iname "*.html" \| while read file
	do
	# TODO: Allow setting filename (which means only one file can be linked, but usually that's what we want).
	filename=$(basename "$file")
	ln -srv "$file" "${subdir}/${filename}"
	done
	fi

	# ** Compress to archive file

	if [[ $archive ]]
	then
	archive_name="$(pwd)/${subdir}.tar.${compression}"

	# Don't overwrite existing file
	[[ -e $archive_name ]] && die "File exists: $archive_name"

	# Make archive from inside subdir to avoid leading path name.
	cd "$subdir" \|\| die "Couldn't change to $subdir"
	tar --create --auto-compress --file "$archive_name" ./ \|\| error "Unable to make archive"
	cd ..

	# Remove subdir if it didn't exist before running this script.
	if [[ $subdir_existed ]]
	then
	echo "NOT deleting existing subdir: $subdir" >&2
	else
	rm -rf "$subdir"
	fi
	fi

	exit $errors