-
-
Save extratone/9d070abbfb4804604a5964211b04f7e0 to your computer and use it in GitHub Desktop.
Download from archive.org Wayback Machine
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Configuration for mirroring archived pages from the Wayback Machine.

# Site of interest.
# NOTE(review): this value does not appear to be read anywhere in the visible
# script; it is kept so that anything sourcing this file still sees it.
url=http://redefininggod.com

# Base URL of the Wayback Machine; capture paths ("/web/...") are appended.
webarchive=https://web.archive.org

# Download command. Kept as a plain string (not an array) because the call
# sites expand it unquoted and rely on word splitting to separate the options.
wget="wget -e robots=off -nv"

# A literal tab character: the field separator used in listing.txt.
tab=$'\t'

# Input file listing the URLs to process, one per line.
additional_url=url.list
# Construct listing.txt from url.list
#
# listing.txt is the list of archived pages to fetch, including some wildcard
# URLs. Each line contains four fields separated by tabs:
# - the last capture date (opaque format; if it differs from the value saved
#   by an earlier run, the last year's index file will be redownloaded)
# - the first capture year (hint for which is the oldest index to query)
# - the last capture year (hint for which is the latest index to query)
# - the URL, starting with "/web/YYYYMMDDHHMMSS/" or "/web/*/" (only)

#######################################
# Convert a Wayback Machine search-results page into listing.txt lines.
# Only lines inside <table id="resultsUrl"> are considered. For each row the
# link target, the 4-digit year of the "dateFrom" cell and the text + year of
# the "dateTo" cell are collected, then emitted when </tr> is reached.
# Inputs:  HTML on stdin.
# Outputs: "<dateTo text>\t<first year>\t<last year>\t<href>" lines on stdout.
#          A row without those cells (e.g. a header row) may produce a blank
#          or malformed line.
# NOTE: relies on GNU sed (\n and \t inside the replacement text).
#######################################
parse_results_table() {
  sed -E -e '
    /<table id="resultsUrl">/,/<\/table>/ {
      /a href/ {
        s/.*href="(.*)".*/\1/;
        h
      };
      /dateFrom/ {
        s/.*([0-9]{4})<\/td>.*/\1/;
        x;
        H
      };
      /dateTo/ {
        s/.*>(.*)([0-9]{4})<\/td>.*/\1\2\n\2/;
        x;
        H
      };
      /<\/tr>/ {
        x;
        s/(.*)\n(.*)\n(.*)\n(.*)/\1\t\3\t\2\t\4/;
        p
      }
    };
    d'
}

#######################################
# Build listing.txt from the entries of the URL list file.
# Three kinds of entries are recognised:
#   - a full web.archive.org URL: reduced to its "/web/..." path;
#   - "something/*": the Wayback Machine results page for that prefix is
#     downloaded and every row of its results table becomes a listing line;
#   - anything else: treated as a page URL and turned into "/web/*/<url>".
# Globals:  additional_url (read), webarchive (read), wget (read)
# Outputs:  truncates and rewrites ./listing.txt; for "something/*" entries
#           also writes the results page (.html) and its parsed form (.txt)
#           under "./web/*/". Warnings go to stderr.
#######################################
build_listing() {
  local entry results

  : >listing.txt

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while read -r entry || [[ -n "$entry" ]]; do
    [[ -n "$entry" ]] || continue

    if [[ "$entry" == http*/web.archive.org* ]]; then
      entry="${entry#http*/web.archive.org}"
    elif [[ "$entry" == */\* ]]; then
      results="./web/*/$entry"
      mkdir -p "$(dirname "$results")"
      # $wget is a command string and must be word-split.
      # shellcheck disable=SC2086
      if ! $wget "$webarchive/web/*/$entry" -O "$results.html"; then
        printf 'warning: could not fetch results page for %s\n' "$entry" >&2
        continue
      fi
      parse_results_table <"$results.html" >"$results.txt"
      cat "$results.txt" >>listing.txt
      continue
    else
      entry="/web/*/$entry"
    fi

    # NOTE(review): the 1996-2014 year range is hard-coded, so yearly indexes
    # after 2014 are never queried for these entries.
    printf '%s\t%s\t%s\t%s\n' "$(date)" 1996 2014 "$entry" >>listing.txt
  done <"$additional_url"
}

build_listing
# Construct listing2.txt
#
# Replace every wildcard URL of listing.txt by all of its captures, taken from
# the yearly index pages. Lines of listing2.txt only contain a URL starting
# with "/web/YYYYMMDDHHMMSS/". The file may contain duplicates across entries.

#######################################
# Extract capture links from an index page.
# Inputs:  HTML on stdin.
# Outputs: for every line containing href="/web/<digits>/...", the target of
#          the last href on that line, one per line on stdout.
#######################################
extract_capture_links() {
  grep -E 'href="/web/[0-9]*/' | sed -E 's/.*href="([^"]*)".*/\1/'
}

#######################################
# Build listing2.txt from listing.txt.
# Non-wildcard URLs are copied as-is. For a "/web/*/" URL, the index page of
# each year between the first and last capture year is downloaded (unless a
# non-empty copy exists and nothing indicates it is stale), its capture links
# are collected, and the de-duplicated set is appended to listing2.txt.
# An index page is considered stale when its year is >= the last year recorded
# by the previous run and the last capture date has changed since then.
# Globals:  webarchive (read), wget (read)
# Outputs:  truncates and rewrites ./listing2.txt; writes index pages and the
#           per-URL .txt / .lastcap.txt / .lastyear.txt state files under
#           ./web/. Warnings go to stderr.
#######################################
build_listing2() {
  local line lastcap firstyear lastyear mainurl
  local listing oldlastcap oldlastyear y u page

  : >listing2.txt

  while read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue

    # Split on tabs only (the date field contains spaces). Using read instead
    # of an unquoted array assignment avoids pathname expansion of "/web/*/".
    lastcap='' firstyear='' lastyear='' mainurl=''
    IFS=$'\t' read -r lastcap firstyear lastyear mainurl _ <<<"$line"

    if [[ -z "$mainurl" ]]; then
      printf 'warning: skipping malformed listing line: %s\n' "$line" >&2
      continue
    fi

    if [[ "$mainurl" != *'/web/*/'* ]]; then
      printf '%s\n' "$mainurl" >>listing2.txt
      continue
    fi

    # Guard the year loop: a missing or garbled year would otherwise trigger
    # requests for a huge range of bogus years.
    if ! [[ "$firstyear" =~ ^[12][0-9]{3}$ && "$lastyear" =~ ^[12][0-9]{3}$ ]]; then
      printf 'warning: bad year range "%s"-"%s" for %s\n' \
        "$firstyear" "$lastyear" "$mainurl" >&2
      continue
    fi

    listing="./$mainurl.txt"
    mkdir -p "$(dirname "$listing")"
    : >"$listing"

    # State saved by the previous run; absent on the first run.
    oldlastcap="$(cat "./$mainurl.lastcap.txt" 2>/dev/null)"
    oldlastyear="$(cat "./$mainurl.lastyear.txt" 2>/dev/null)"
    : "${oldlastyear:=$lastyear}"

    for ((y = firstyear; y <= lastyear; y++)); do
      u="/web/${y}0101000000*/${mainurl#/web/*/}"
      page="./$u.html"
      mkdir -p "$(dirname "$page")"
      if [[ ! -s "$page" || ( "$y" -ge "$oldlastyear" && "$lastcap" != "$oldlastcap" ) ]]; then
        # $wget is a command string and must be word-split.
        # shellcheck disable=SC2086
        $wget "$webarchive$u" -O "$page" \
          || printf 'warning: could not fetch index %s\n' "$u" >&2
      fi
      if [[ -r "$page" ]]; then
        extract_capture_links <"$page" >>"$listing"
      fi
    done

    printf %s "$lastcap" >"./$mainurl.lastcap.txt"
    printf %s "$lastyear" >"./$mainurl.lastyear.txt"

    # The file must be fed to sort itself: a bare `<file | sort` pipes the
    # (empty) output of a null command and silently yields nothing.
    LC_ALL=C sort -u -- "$listing" >>listing2.txt
  done <listing.txt
}

build_listing2
# Construct listing3.txt
#
# Sort and de-duplicate listing2.txt, and request the unmodified page by
# appending "id_" to the timestamp. URLs must start with
# "/web/YYYYMMDDHHMMSSid_/" only. This is the list of files that need to be
# downloaded (if not already present).

#######################################
# Turn capture URLs into sorted, unique "id_" capture URLs.
# Inputs:  "/web/<digits>/..." URLs on stdin, one per line.
# Outputs: the de-duplicated URLs on stdout, in byte-wise sorted order, with
#          "id_" inserted after the leading "/web/<digits>" component. Lines
#          not starting with "/web/<digits>/" pass through unchanged.
#######################################
to_raw_capture_urls() {
  LC_ALL=C sort -u | sed -E 's:^/web/([0-9]*)/:/web/\1id_/:'
}

# The output redirection comes first so that listing3.txt is truncated even
# when listing2.txt is missing.
to_raw_capture_urls >listing3.txt <listing2.txt
# Download listing3

#######################################
# Map a capture URL to the local path it is stored at.
# Arguments: $1 - capture URL path (as found in listing3.txt)
# Outputs:   "./<url>" on stdout, or "./<url>/.index" when the URL ends with
#            a slash (a directory-like URL cannot itself be a file name).
#######################################
local_path_for() {
  local url=$1
  if [[ "$url" == */ ]]; then
    printf '%s\n' "./$url/.index"
  else
    printf '%s\n' "./$url"
  fi
}

#######################################
# Download every URL of listing3.txt that is not already present locally.
# A URL is skipped when its local path exists and is non-empty. Each download
# goes to "<path>.part" and is renamed only when wget succeeds, so a failed or
# interrupted transfer is retried on the next run instead of being mistaken
# for a complete file.
# Globals:  webarchive (read), wget (read)
# Outputs:  files under ./web/; warnings on stderr.
#######################################
download_listing3() {
  local url f

  while read -r url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    f="$(local_path_for "$url")"
    mkdir -p "$(dirname "$f")"
    if [[ -s "$f" ]]; then
      continue
    fi

    # $wget is a command string and must be word-split.
    # shellcheck disable=SC2086
    if $wget "$webarchive$url" -O "$f.part"; then
      mv -- "$f.part" "$f"
    else
      printf 'warning: could not download %s\n' "$url" >&2
      rm -f -- "$f.part"
    fi
  done <listing3.txt
}

download_listing3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment