Skip to content

Instantly share code, notes, and snippets.

@weiglemc
Created January 24, 2023 19:13
Show Gist options
  • Save weiglemc/9c58b154abe6d8ad9070332a6da4a590 to your computer and use it in GitHub Desktop.
Save weiglemc/9c58b154abe6d8ad9070332a6da4a590 to your computer and use it in GitHub Desktop.
Python script to grab raw HTML from Wayback Machine
# run from untracked/html as
# % python3 ../../capture-html.py < ../../cnn-to-request.txt >> ../../cnn-html-list.txt
import sys
import time
import requests
WAIT = 10  # seconds to wait between requests
TIMEOUT = 60  # seconds before a single request is abandoned
DONE_URI_LIST = "../../cnn-html-list.txt"

# setup requests headers
# NOTE(review): the 'From' value looks like a scrape-obfuscated e-mail
# address - restore the real contact address before running.
headers = {
    'User-Agent': 'ODU WS-DL research script',
    'From': '[email protected]'
}


def _load_done(path):
    """Return the set of URI-Ms already recorded in the file at *path*.

    Only the first whitespace-separated field of each line is used, so the
    list may carry extra columns after the URI-M.  Blank lines are ignored.
    Raises OSError if the file cannot be opened.
    """
    done = set()
    with open(path) as fp:
        for line in fp:
            fields = line.split()
            if fields:
                done.add(fields[0])
    return done


def _to_raw_urim(url):
    """Return *url* with ``id_`` inserted after the datetime.

    A URL that already contains ``id_`` is returned unchanged.  Only the
    first ``/http`` is rewritten so that an archived URL which itself
    contains ``/http`` further along is left intact.
    """
    if "id_" in url:
        return url
    return url.replace("/http", "id_/http", 1)


def _output_name(urim):
    """Return ``cnn-DT.html`` for *urim*, or None if no DT can be found.

    DT is the 14 characters immediately before the first ``id_``; they must
    all be digits, otherwise the result would not be a usable filename.
    """
    idx = urim.find("id_")
    if idx < 14:
        return None
    dt = urim[idx - 14:idx]
    if not dt.isdigit():
        return None
    return "cnn-" + dt + ".html"


def main(urls=None, done_list=DONE_URI_LIST, wait=WAIT):
    """Fetch each URI-M in *urls* and save its body as ``cnn-DT.html``.

    urls      -- iterable of URI-M lines; defaults to sys.stdin
    done_list -- path of the file listing URI-Ms that were already fetched
    wait      -- seconds to sleep after every request attempt

    Progress is reported on stderr.  Each URI-M is printed to stdout only
    after its HTML has been written, so stdout can be appended to
    *done_list* without recording captures that never reached disk.
    """
    if urls is None:
        urls = sys.stdin

    # read in URI-Ms we've already requested
    requested = _load_done(done_list)

    for raw in urls:
        # normalise before comparing so the check does not depend on
        # whether the input line happens to end with a newline
        url = raw.strip()
        if not url:
            continue
        url = _to_raw_urim(url)

        # check to see if we already have data on this URI-M
        if url in requested:
            print(url + " - already DONE", file=sys.stderr, flush=True)
            continue

        # construct output filename (cnn-DT.html) - DT is 14 digits
        outfile = _output_name(url)
        if outfile is None:
            print(url + " - SKIPPED: no 14-digit datetime before id_",
                  file=sys.stderr, flush=True)
            continue

        print("REQ " + url, end=' ', file=sys.stderr, flush=True)
        try:
            # NOTE(review): the HTTP status is not checked, so an error page
            # is saved like any other response - confirm that is intended.
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
        except requests.exceptions.RequestException as err:
            print(" - EXCEPTION: ", str(err), file=sys.stderr, flush=True)
            time.sleep(wait)  # keep the request rate down after failures too
            continue

        # write content to file; a fixed encoding keeps the write from
        # failing on platforms whose default codec cannot encode the text
        with open(outfile, mode="w", encoding="utf-8") as f:
            f.write(response.text)

        # write url to stdout (want to redirect this to append to DONE_URI_LIST)
        print(url)
        sys.stdout.flush()
        requested.add(url)  # do not fetch a duplicate input line again

        print(" - DONE, waiting " + str(wait) + " seconds... ",
              file=sys.stderr, flush=True)
        time.sleep(wait)  # wait between requests


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment