Beej126 · September 15, 2020 03:59
diff --git a/wayback.ps1 b/wayback.ps1
 # Restores a blog's wp-content/uploads images from the Internet Archive (Wayback Machine)
 # into the current directory, recreating the folders the images used to live in
 # so the existing posts referencing these files don't need to have their urls fixed.
 #
 # How the urls below were worked out:
 #  - start with web archive's interactive search to home in on the urls we need...
 #      e.g. https://web.archive.org/web/*/https://www.BeejBlog.com/wp-content/uploads//*
 #  - viewing the ajax behind the search page (browser tools > network tab) we see the json feed
 #    for a flat list of images & the crucial wayback datestamp
 #      https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp
 #  - click into the first search result and viewing the page source we see the format for direct image urls:
 #      e.g. https://web.archive.org/web/20160601110654if_/http://www.beejblog.com/wp-content/uploads/2008/08/204996604.jpg

 # download the json list of our urls
 # NOTE: in Windows PowerShell `curl` is an alias for Invoke-WebRequest; on PowerShell 6+ it resolves to the native curl executable
 $urls = (curl 'https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp' | convertfrom-json)

 $webclient = New-Object System.Net.WebClient

 # iterate the list...
 $urls |
 # skip the first line of headers
 select -skip 1 | %{

   # each entry carries the fields requested via fl= : [0] = url, [1] = datestamp
   # (entries are expected to arrive as arrays already; -split works element by element on an array,
   #  so this line only changes anything if an entry is a single carriage-return separated string)
   $props = $_ -split '\r'

   #notice the url format of direct url image shown above:
   #in my case i want to extract the blog folder these images used to be in...e.g. "2008/08/204996604.jpg"

   #split the url on a known phrase just prior to the folders
   $relative = ($props[0] -split "wp-content/uploads")[1]
   #a query string (e.g. "?w=300") is not part of the file name on disk
   $relative = ($relative -split '\?')[0]

   #entries without a file name (the uploads folder itself, or a folder url ending in "/") have nothing to save
   if(!$relative -or $relative.EndsWith("/")) {
     echo "skipping (not a file): $($props[0])"
     return # inside %{ } this just moves on to the next entry
   }

   #then split on the folder slash, dropping the empty pieces produced by leading/doubled slashes
   # typically we wind up with [0] = "2008" folder, [1] = "08" subfolder, [2] = filename
   # but every folder level is kept so deeper paths are not truncated
   $folders = @($relative -split "/" | ?{ $_ })

   #absolute path, because the .net calls below resolve relative paths against the
   #process working directory rather than powershell's current location
   $target = "$($pwd)\$($folders -join '\')"
   $targetFolder = [System.IO.Path]::GetDirectoryName($target)

   #download the file
   echo target: $target
   # only if it doesn't exist already (from previous runs of this script for example)
   # -LiteralPath so "[" and "]" in a file name aren't treated as wildcards
   if(!(Test-Path -LiteralPath $target)) {
     $downloadUrl = "https://web.archive.org/web/$($props[1])if_/$($props[0])"
     echo downloading: $downloadUrl
     try {
       #create the folders (no-op when they already exist)
       [void][System.IO.Directory]::CreateDirectory($targetFolder)
       #i really don't have a reason for using .net framework webclient vs powershell's invoke-webrequest
       $webclient.DownloadFile($downloadUrl, $target)
     }
     catch {
       # best effort: report the failure and carry on with the remaining files
       Write-Warning "download failed: $downloadUrl ($($_.Exception.Message))"
     }

     # pause between requests to go easy on the archive; skipped files cost no request so they don't wait
     sleep 1
   }
   else {
     echo skipping
   }
 }

 # done downloading, release the web client
 $webclient.Dispose()
	# Restores a blog's wp-content/uploads images from the Internet Archive (Wayback Machine)
	# into the current directory, recreating the folders the images used to live in
	# so the existing posts referencing these files don't need to have their urls fixed.
	#
	# How the urls below were worked out:
	#  - start with web archive's interactive search to home in on the urls we need...
	#      e.g. https://web.archive.org/web/*/https://www.BeejBlog.com/wp-content/uploads//*
	#  - viewing the ajax behind the search page (browser tools > network tab) we see the json feed
	#    for a flat list of images & the crucial wayback datestamp
	#      https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp
	#  - click into the first search result and viewing the page source we see the format for direct image urls:
	#      e.g. https://web.archive.org/web/20160601110654if_/http://www.beejblog.com/wp-content/uploads/2008/08/204996604.jpg

	# download the json list of our urls
	# NOTE: in Windows PowerShell `curl` is an alias for Invoke-WebRequest; on PowerShell 6+ it resolves to the native curl executable
	$urls = (curl 'https://web.archive.org/cdx/search?url=https%3A%2F%2Fwww.BeejBlog.com%2Fwp-content%2Fuploads%2F%2F&matchType=prefix&collapse=urlkey&output=json&fl=original%2Ctimestamp%2Cendtimestamp' | convertfrom-json)

	$webclient = New-Object System.Net.WebClient

	# iterate the list...
	$urls |
	# skip the first line of headers
	select -skip 1 | %{

	  # each entry carries the fields requested via fl= : [0] = url, [1] = datestamp
	  # (entries are expected to arrive as arrays already; -split works element by element on an array,
	  #  so this line only changes anything if an entry is a single carriage-return separated string)
	  $props = $_ -split '\r'

	  #notice the url format of direct url image shown above:
	  #in my case i want to extract the blog folder these images used to be in...e.g. "2008/08/204996604.jpg"

	  #split the url on a known phrase just prior to the folders
	  $relative = ($props[0] -split "wp-content/uploads")[1]
	  #a query string (e.g. "?w=300") is not part of the file name on disk
	  $relative = ($relative -split '\?')[0]

	  #entries without a file name (the uploads folder itself, or a folder url ending in "/") have nothing to save
	  if(!$relative -or $relative.EndsWith("/")) {
	    echo "skipping (not a file): $($props[0])"
	    return # inside %{ } this just moves on to the next entry
	  }

	  #then split on the folder slash, dropping the empty pieces produced by leading/doubled slashes
	  # typically we wind up with [0] = "2008" folder, [1] = "08" subfolder, [2] = filename
	  # but every folder level is kept so deeper paths are not truncated
	  $folders = @($relative -split "/" | ?{ $_ })

	  #absolute path, because the .net calls below resolve relative paths against the
	  #process working directory rather than powershell's current location
	  $target = "$($pwd)\$($folders -join '\')"
	  $targetFolder = [System.IO.Path]::GetDirectoryName($target)

	  #download the file
	  echo target: $target
	  # only if it doesn't exist already (from previous runs of this script for example)
	  # -LiteralPath so "[" and "]" in a file name aren't treated as wildcards
	  if(!(Test-Path -LiteralPath $target)) {
	    $downloadUrl = "https://web.archive.org/web/$($props[1])if_/$($props[0])"
	    echo downloading: $downloadUrl
	    try {
	      #create the folders (no-op when they already exist)
	      [void][System.IO.Directory]::CreateDirectory($targetFolder)
	      #i really don't have a reason for using .net framework webclient vs powershell's invoke-webrequest
	      $webclient.DownloadFile($downloadUrl, $target)
	    }
	    catch {
	      # best effort: report the failure and carry on with the remaining files
	      Write-Warning "download failed: $downloadUrl ($($_.Exception.Message))"
	    }

	    # pause between requests to go easy on the archive; skipped files cost no request so they don't wait
	    sleep 1
	  }
	  else {
	    echo skipping
	  }
	}

	# done downloading, release the web client
	$webclient.Dispose()