Created
January 24, 2023 19:13
-
-
Save weiglemc/9c58b154abe6d8ad9070332a6da4a590 to your computer and use it in GitHub Desktop.
Python script to grab raw HTML from Wayback Machine
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run from untracked/html as
# % python3 ../../capture-html.py < ../../cnn-to-request.txt >> ../../cnn-html-list.txt
import sys
import time

import requests
WAIT = 10  # seconds to wait between requests
TIMEOUT = 60  # seconds to wait for timeout
DONE_URI_LIST = "../../cnn-html-list.txt"

# setup requests headers
headers = {
    'User-Agent': 'ODU WS-DL research script',
    'From': '[email protected]'
}


def load_requested(path):
    """Return the set of URI-Ms already recorded in the done-list at *path*.

    The first whitespace-separated field of every non-blank line is taken
    as a URI-M.  A missing file is treated as an empty list so that a first
    run does not crash.
    """
    done = set()
    try:
        with open(path) as fp:
            for line in fp:
                fields = line.split()
                if fields:
                    done.add(fields[0])
    except FileNotFoundError:
        pass
    return done


def raw_urim(url):
    """Return *url* stripped of surrounding whitespace, with ``id_`` added.

    ``id_`` is inserted in front of the first "/http" only (the one that
    follows the datetime), so an archived URL that itself contains "/http"
    is left intact.  URLs that already contain ``id_`` are returned as-is.
    """
    url = url.strip()
    if "id_" not in url:
        # add id_ after datetime
        url = url.replace("/http", "id_/http", 1)
    return url


def output_filename(url):
    """Return ``cnn-DT.html`` for *url*, where DT is the 14 digits before ``id_``.

    Returns None when *url* has no ``id_`` or the 14 characters preceding it
    are not all digits, so the caller never builds a bogus file name.
    """
    idx = url.find("id_")
    dt = url[idx - 14:idx] if idx >= 14 else ""
    if len(dt) != 14 or not dt.isdigit():
        return None
    return "cnn-" + dt + ".html"


def main():
    """Fetch every URI-M read from stdin and save its body as cnn-DT.html.

    Progress messages go to stderr.  Each successfully saved URI-M is echoed
    to stdout (meant to be appended to DONE_URI_LIST) so later runs skip it.
    """
    # read in URI-Ms we've already requested
    requested = load_requested(DONE_URI_LIST)

    for line in sys.stdin:
        url = raw_urim(line)
        if not url:
            continue  # blank input line

        # check to see if we already have data on this URI-M
        if url in requested:
            print(url + " - already DONE", file=sys.stderr)
            sys.stderr.flush()
            continue

        print("REQ " + url, end=' ', file=sys.stderr)
        sys.stderr.flush()

        # construct output filename (cnn-DT.html) - DT is 14 digits
        outfile = output_filename(url)
        if outfile is None:
            # nothing could be saved for this URL, so do not request it or
            # record it as done
            print(" - SKIPPED: no 14-digit datetime in URI-M", file=sys.stderr)
            sys.stderr.flush()
            continue

        try:
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
        except requests.exceptions.RequestException as err:
            print(" - EXCEPTION: ", str(err), file=sys.stderr)
            sys.stderr.flush()
            time.sleep(WAIT)  # keep the delay between requests after a failure too
            continue

        # NOTE(review): the body is saved whatever the HTTP status code is;
        # confirm whether error responses should be recorded as done.
        # write content to file (explicit encoding: do not depend on the locale)
        with open(outfile, mode="w", encoding="utf-8") as f:
            f.write(response.text)

        # write url to stdout (want to redirect this to append to DONE_URI_LIST);
        # done only after the file has been written successfully
        print(url)
        sys.stdout.flush()
        requested.add(url)  # skip duplicates later in the same input

        print(" - DONE, waiting " + str(WAIT) + " seconds... ", file=sys.stderr)
        sys.stderr.flush()
        time.sleep(WAIT)  # wait WAIT seconds between requests


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment