Last active
September 15, 2020 03:59
-
-
Save Beej126/a38495a051cf06c47c6bf84db6dbbf9a to your computer and use it in GitHub Desktop.
Internet Archive (aka Wayback Machine) blog image recovery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recover blog images from the Internet Archive (Wayback Machine) into the
# current directory, recreating the original "<year>\<month>\<file>" layout so
# existing posts that reference these files don't need their urls fixed.
#
# How the urls below were found:
# - start with web archive's interactive search to home in on the urls we need...
#     e.g. https://web.archive.org/web/*/https://www.BeejBlog.com/wp-content/uploads//*
# - viewing the ajax behind the search page (browser tools > network tab) we see the
#   json feed for a flat list of images & the crucial wayback datestamp:
#     https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp
# - clicking into the first search result and viewing the page source we see the
#   format for direct image urls:
#     e.g. https://web.archive.org/web/20160601110654if_/http://www.beejblog.com/wp-content/uploads/2008/08/204996604.jpg

# Older Windows PowerShell / .NET Framework setups may not offer TLS 1.2 by default.
# Only opt in when the process has an explicit protocol list that lacks it; a value
# of 0 means "let the OS decide" and is left alone.
$protocols = [Net.ServicePointManager]::SecurityProtocol
if (([int]$protocols -ne 0) -and -not ($protocols -band [Net.SecurityProtocolType]::Tls12)) {
  [Net.ServicePointManager]::SecurityProtocol = $protocols -bor [Net.SecurityProtocolType]::Tls12
}

# download the json list of our urls
$urls = (curl 'https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp' | convertfrom-json)

$webclient = New-Object System.Net.WebClient
try {
  # iterate the list...
  $urls |
    # skip the first line of headers
    select -skip 1 | %{
      # per the fl= parameter above each entry carries: original url, timestamp, endtimestamp
      # NOTE(review): entries presumably arrive as arrays already; -split on an array is
      # applied per element, so this leaves them intact - confirm before simplifying
      $props = $_ -split '\r' # [0] = url, [1] = datestamp

      # notice the url format of direct image urls shown above:
      # we want the blog folders these images used to be in... e.g. "2008/08/204996604.jpg"
      # split the url on a known phrase just prior to the folders, then split on the folder slash
      $folders = ($props[0] -split "wp-content/uploads")[1] -split "/"
      # we'll wind up with $folders[1] = "2008" folder, [2] = "08" subfolder, [3] = filename

      # ignore entries that don't follow the <year>/<month>/<file> layout
      # ('return' inside a ForEach-Object block moves on to the next entry)
      if (($folders.Count -lt 4) -or !$folders[1] -or !$folders[2] -or !$folders[3]) {
        echo "skipping unexpected url: $($props[0])"
        return
      }

      # create the folders (if not exists); -LiteralPath so [ ] in names aren't treated as wildcards
      $dir = "$($folders[1])\$($folders[2])"
      if (!(Test-Path -LiteralPath $dir)) { md $dir }

      # absolute path, because .NET's working directory can differ from PowerShell's $pwd
      $target = "$($pwd)\$($folders[1])\$($folders[2])\$($folders[3])"
      echo target: $target

      # only download if it doesn't exist already (from previous runs of this script for example)
      if (!(Test-Path -LiteralPath $target)) {
        $downloadUrl = "https://web.archive.org/web/$($props[1])if_/$($props[0])"
        echo downloading: $downloadUrl

        # download under a temporary name and rename on success, so an interrupted
        # run never leaves a truncated file that later runs would skip as "done"
        $partial = "$target.partial"
        try {
          # i really don't have a reason for using .net framework webclient vs powershell's invoke-webrequest
          $webclient.DownloadFile($downloadUrl, $partial)
          Move-Item -LiteralPath $partial -Destination $target -ErrorAction Stop
        }
        catch {
          Write-Warning "failed: $downloadUrl ($($_.Exception.Message))"
          if (Test-Path -LiteralPath $partial) { Remove-Item -LiteralPath $partial }
        }

        # be polite to the archive; only needed after an actual request
        sleep 1
      }
      else {
        echo skipping
      }
    }
}
finally {
  $webclient.Dispose()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment