Last active
May 1, 2024 17:21
-
-
Save qskwood/4864e9d0077e5e3ce2214587b6131afe to your computer and use it in GitHub Desktop.
This script uses wget to generate a WARC that can be read by a player like OpenWayback and places it into a directory. It also handles creation and use of CDX indexes for de-duplication.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Capture a URL and its page requisites into a WARC file with wget, storing
# the result in a per-collection directory. A per-collection CDX index
# ("<collection>.cdx") is created on the first run and extended on later
# runs; when it already exists it is handed to wget for de-duplication.
#
# Usage:
#   <script> <url> <collection>
#
# Arguments:
#   $1 - URL to capture
#   $2 - collection name; used both as the directory name under
#        /var/spool/openwayback/files2 and as the basename of the CDX index
#
# Exit status:
#   1 on bad usage, if the collection directory cannot be created/entered,
#   or if wget produced no CDX file; otherwise the status of the final
#   index merge command (cp or tail).

if [[ $# -ne 2 ]]; then
  echo "Must only have two arguments, the URL and the collection" >&2
  exit 1
fi

url=$1
collection=$2
collection_dir="/var/spool/openwayback/files2/${collection}"
collection_cdx="${collection}.cdx"

# -p makes creation idempotent, so no separate existence test (and no race
# between the test and the mkdir) is needed.
if ! mkdir -p -- "${collection_dir}"; then
  echo "Cannot create collection directory ${collection_dir}" >&2
  exit 1
fi
cd -- "${collection_dir}" || exit 1

# Nanosecond-resolution UTC timestamp keeps WARC names unique per run
# (%N requires GNU date).
timestamp=$(date -u +%Y%m%d%H%M%S%N)
warc_name="at-${timestamp}"
run_cdx="${warc_name}.cdx"

# Build the wget command line as an array so that every argument stays a
# single word even if the collection name contains whitespace or glob
# characters.
wget_args=(
  --delete-after
  --execute robots=off
  --no-directories
  --no-warc-keep-log
  --page-requisites
  --span-hosts
  --user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
  --warc-cdx=on
)

# Only de-duplicate once an index from a previous run exists.
if [[ -e "${collection_cdx}" ]]; then
  wget_args+=("--warc-dedup=${collection_cdx}")
fi

wget_args+=("--warc-file=${warc_name}" "${url}")

# wget frequently exits non-zero for partial failures (for example a single
# missing page requisite), so its status is reported but not treated as
# fatal. Append "2> /dev/null" here to silence wget's progress output.
wget "${wget_args[@]}"
wget_status=$?

# Without a per-run CDX there is nothing to merge; stop with a clear message
# instead of letting cp/tail fail on a missing file.
if [[ ! -e "${run_cdx}" ]]; then
  echo "wget (exit status ${wget_status}) produced no CDX file ${run_cdx} for ${url}" >&2
  exit 1
fi

if [[ ! -e "${collection_cdx}" ]]; then
  # First capture for this collection: the run's CDX becomes the index.
  cp -- "${run_cdx}" "${collection_cdx}"
else
  # Append the new records, skipping the first line of the run's CDX
  # (its header line) so the header is not repeated in the index.
  tail -n +2 -- "${run_cdx}" >> "${collection_cdx}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment