Created
June 28, 2020 02:18
-
-
Save eggsyntax/521d99af8b609047cf8cd447c7f64eaa to your computer and use it in GitHub Desktop.
One-off script to grab latest valid archive.org capture URLs for a list of ordinary URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns archiver | |
(:require [clojure.string :as s])) | |
(def archive-root
  "Base URL under which both archive.org timemaps and captures are addressed."
  "https://web.archive.org/web/")

(def cutoff-date
  "Archive.org-style datestring; only captures strictly earlier than this are
  considered valid."
  "20200621000000")
(def ssc-urls
  "The URLs to process, one element per line of the file named 'ssc-urls' in
  the current directory. The file is read when this namespace is loaded, so it
  must exist at that point."
  (s/split-lines (slurp "ssc-urls")))
(defn timemap-url
  "Builds the archive.org timemap URL for the ordinary URL `u` by appending
  \"timemap/\" and then `u` to `archive-root`."
  [u]
  (let [timemap-root (str archive-root "timemap/")]
    (str timemap-root u)))
(defn timemap
  "Fetches the archive.org timemap for the ordinary SSC URL `u` and returns it
  split into lines, each line being a string that represents one capture."
  [u]
  (let [response-body (slurp (timemap-url u))]
    (s/split-lines response-body)))
(defn timemap-line->capture-datestring
  "Extracts the capture datestring from a single timemap line `l`. The line is
  split on runs of whitespace and the second field is returned; the result is
  nil when the line has no second field."
  [l]
  (let [fields (s/split l #"\s+")]
    (second fields)))
(defn timemap-datestrings
  "SSC URL -> lazy sequence of the datestrings of its archive.org captures, one
  per line of the URL's timemap."
  [u]
  (->> (timemap u)
       (map timemap-line->capture-datestring)))
(defn earlier-than
  "True when datestring `d1` is strictly earlier than datestring `d2`.
  Takes the easy path: both all-digit datestrings are parsed as longs and
  compared numerically, which is sound because archive.org datestrings have a
  fixed length. Throws NumberFormatException if either argument does not parse."
  [d1 d2]
  (let [n1 (Long/parseLong d1)
        n2 (Long/parseLong d2)]
    (< n1 n2)))
(defn latest-valid-datestring
  "Given a sequence of datestrings, returns the latest one prior to
  `cutoff-date` (June 21, the day before Scott took the site down), or nil when
  no entry qualifies.

  Entries that are not all-digit strings (for example the nil that results from
  a blank timemap line) are skipped instead of being handed to `earlier-than`,
  which would throw on them. The result does not depend on the order of `dss`."
  [dss]
  (->> dss
       ;; drop nils and anything Long/parseLong could not handle
       (filter #(and (string? %) (re-matches #"\d+" %)))
       (filter #(earlier-than % cutoff-date))
       (reduce (fn [latest ds]
                 ;; keep whichever is later; on a tie the later-seen entry wins,
                 ;; matching what `last` returned for sorted input
                 (if (and latest (earlier-than ds latest))
                   latest
                   ds))
               nil)))
(defn datestring->archive-url
  "Produces the URL of the archive.org capture of `url` taken at `datestring`
  (an archive.org datestring such as '20190829102638'): `archive-root`, then
  the datestring, a slash, and the original URL."
  [url datestring]
  (let [capture-root (str archive-root datestring)]
    (str capture-root "/" url)))
(defn convert-urls
  "Lazily maps each ordinary URL in `urls` to the URL of its latest archive.org
  capture prior to the cutoff date, or to nil when no such capture was found
  (rather than building a malformed URL with an empty datestring).

  Results stay positionally aligned with `urls`. Each element sleeps for one
  second before its timemap is fetched; because the returned seq is lazy, the
  sleeps and requests only happen as the result is consumed."
  [urls]
  (for [url urls]
    (do (Thread/sleep 1000) ; avoid rate-limiting
        (when-let [datestring (-> url
                                  timemap-datestrings
                                  latest-valid-datestring)]
          (datestring->archive-url url datestring)))))
;; Run notes: started at 10:15.
;; Estimated runtime: ~2 seconds per URL x 1025 URLs = 2050 seconds (~34 minutes).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment