Last active
August 3, 2024 00:09
-
-
Save lazanet/872f88c9874e4a7a78fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# Wayback machine downloader | |
#TODO: Remove redundancy (download only newest files in given time period - not all of them and then write over them) | |
############################ | |
clear | |
#Enter domain without http:// and www. | |
domain="google.com" | |
#Set matchType to "prefix" if you have multiple subdomains, or "exact" if you want only one page | |
matchType="domain" | |
#Set datefilter to 1 if you want to download data from specific time period | |
datefilter=0 | |
from="19700101120001" #yyyyMMddhhmmss | |
to="20000101120001" #yyyyMMddhhmmss | |
#Set this to 1 if your page has lots of captured pages with ? in url (experimental) | |
swapurlarguments=0 | |
usersign='&' #sign to replace ? with | |
############################################################## | |
# Do not edit after this point | |
############################################################## | |
#Getting snapshot list | |
full="http://web.archive.org/cdx/search/cdx?url=" | |
full+="$domain" | |
full+="&matchType=$matchType" | |
if [ $datefilter = 1 ] | |
then | |
full+="&from=$from&to=$to" | |
fi | |
full+="&output=json&fl=timestamp,original&fastLatest=true&filter=statuscode:200&collapse=original" #Form request url | |
wget $full -O rawlist.json #Get snapshot list to file rawlist.json | |
#Do parsing and downloading stuff | |
sed 's/\"//g' rawlist.json > list.json #Remove " from file for easier processing | |
rm rawlist.json #Remove unnecessary file | |
i=0; #Set file counter to 0 | |
numoflines=$(cat list.json | wc -l ) #Fill numoflines with number of files to download | |
while read line;do # For every file | |
rawcurrent="${line:1:${#line}-3}" #Remove brackets from JSON line | |
IFS=', ' read -a current <<< "$rawcurrent" #Separate timestamp and url | |
timestamp="${current[0]}" | |
originalurl="${current[1]}" | |
waybackurl="http://web.archive.org/web/$timestamp" | |
waybackurl+="id_/$originalurl" #Form request url | |
file_path="$domain/" | |
sufix="$(echo $originalurl | grep / | cut -d/ -f2- | cut -d/ -f3-)" | |
[[ $sufix = "" ]] && file_path+="index.html" || file_path+="$sufix" #Determine local filename | |
clear | |
echo " $i out of $numoflines" #Show progress | |
echo "$file_path" | |
mkdir -p -- "${file_path%/*}" && touch -- "$file_path" #Make local file for data to be written | |
wget -N $waybackurl -O $file_path #Download actual file | |
((i++)) | |
done < list.json | |
#If user chose, replace ? with usersign | |
if [ $swapurlarguments = 1 ] | |
then | |
cd $domain | |
for i in *; do mv "$i" "`echo $i | sed "s/\?/\$usersign/g"`"; done #Replace ? in filenames with usersign | |
find ./ -type f -exec sed -i "s/\?/\$usersign/g" {} \; #Replace ? in files with usersign | |
fi |
How to change it to create folder for each snapshot without overwriting?
Great script. Thanks.
Worked for me, thanks for taking the time :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It work under Windows with the GitShell.