-
-
Save Fastidious/1b88cee54048782dbc4102a19e11d84e to your computer and use it in GitHub Desktop.
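This script uses wget to capture a URL (and its page requisites) into a WARC file that a replay tool such as OpenWayback can read, and stores that WARC in a per-collection directory. It also creates and maintains a per-collection CDX index, which wget uses for de-duplication on later captures.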
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Capture one URL, together with its page requisites, into a WARC file using
# wget, and maintain a per-collection CDX index that later runs pass back to
# wget for de-duplication.
#
# Usage:   <this-script> URL COLLECTION
#
# Arguments:
#   URL         the page to capture
#   COLLECTION  name of the collection; used as the sub-directory of
#               /var/spool/openwayback/files2 that receives the WARC and as
#               the base name of the cumulative index (COLLECTION.cdx)
#
# Files written inside /var/spool/openwayback/files2/COLLECTION:
#   at-<timestamp>.warc*  the capture itself (written by wget)
#   at-<timestamp>.cdx    the CDX index of this capture (written by wget)
#   COLLECTION.cdx        cumulative index: created from the first capture,
#                         appended to (minus the header line) afterwards
#
# Exit status: 1 on bad arguments, if the collection directory cannot be
# entered, or if wget produced no CDX file; otherwise the status of the
# final cp/tail that updates the cumulative index.

if [[ $# -ne 2 ]]; then
  echo "Must only have two arguments, the URL and the collection" >&2
  exit 1
fi

url=$1
collection=$2

# The collection name becomes a single directory component and a file-name
# prefix. An empty name or one containing '/' would make the relative
# "${collection}.cdx" path below point somewhere other than the index file
# inside the collection directory, so refuse it up front.
if [[ -z "${collection}" || "${collection}" == */* ]]; then
  echo "Collection name must be non-empty and must not contain '/'" >&2
  exit 1
fi

collection_dir="/var/spool/openwayback/files2/${collection}"
index="${collection}.cdx"

if [[ ! -d "${collection_dir}" ]]; then
  # A failure here is caught by the cd check that follows.
  mkdir -- "${collection_dir}"
fi
cd -- "${collection_dir}" || exit 1

# Nanosecond-resolution UTC timestamp used to name this capture's files.
timestamp=$(date -u +%Y%m%d%H%M%S%N)
capture="at-${timestamp}"

# Build the wget command line as an array so that every argument stays one
# word, whatever characters the collection name contains.
wget_args=(
  --delete-after
  --execute robots=off
  --no-directories
  --no-warc-keep-log
  --page-requisites
  --span-hosts
  --user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
  --warc-cdx=on
)
# Only de-duplicate once the collection already has a cumulative index.
if [[ -e "${index}" ]]; then
  wget_args+=("--warc-dedup=${index}")
fi
wget_args+=("--warc-file=${capture}")

# wget's status is recorded but not treated as fatal on its own; what
# matters for the index update below is whether the CDX file was written.
# '--' keeps a URL that starts with '-' from being parsed as an option.
wget_status=0
wget "${wget_args[@]}" -- "${url}" || wget_status=$? #2> /dev/null

if [[ ! -s "${capture}.cdx" ]]; then
  echo "wget (exit status ${wget_status}) did not produce ${capture}.cdx;" \
    "${index} left unchanged" >&2
  exit 1
fi

# The existence of the cumulative index is re-tested here (rather than
# remembered from before wget ran), exactly as the original script did.
if [[ ! -e "${index}" ]]; then
  # First capture for this collection: its CDX (header included) becomes
  # the cumulative index.
  cp -- "${capture}.cdx" "${index}"
else
  # Later captures: skip the first (header) line and append the rest.
  tail -n +2 -- "${capture}.cdx" >> "${index}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment