Last active
April 29, 2025 14:49
-
-
Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Python script to grab data from the Internet Archive via the CDX API server, uses function from Sawood Alam's CDXSummary tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# grab-cdx.py
from random import randint
from time import sleep
from urllib.parse import urlencode

from requests import Session
from rich.console import Console
# --- Configuration ---
URIR = "https://www.cnn.com/"  # original resource whose captures we want
FROM = "20150424"              # start of date range (YYYYMMDD)
TO = "20220923"                # end of date range (YYYYMMDD)
# collapse=timestamp:8 keeps only one capture per day; filter to 200 OK responses
OTHER_PARAMS = f"&from={FROM}&to={TO}&collapse=timestamp:8&filter=statuscode:200"

# Shared HTTP session so TCP connections to the CDX server are reused
REQSESSION = Session()

# Progress/error output in red on stderr, keeping stdout clean for data
errprint = Console(stderr=True, style="red", highlight=False).print
# HELPFUL FUNCTION FROM CDXSUMMARY | |
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url, startPage=0):
    """Yield raw CDX lines (bytes) from the paginated CDX API at *url*.

    Requests ``pageSize=5`` blocks per HTTP call (so ``page`` advances by 5)
    and sleeps 8-11 s between calls to be polite to the Internet Archive.

    :param url: CDX query URL without pagination parameters
    :param startPage: page index to resume from (default 0)
    """
    # Total page count at the default page size; each request below covers
    # 5 of these pages (pageSize=5), hence the step of 5 in the loop.
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(startPage, pages, 5):  # grab 5 pages at a time (ht Lesley, Sawood)
        pageurl = f"{url}&page={page}&pageSize=5"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        r = REQSESSION.get(pageurl, stream=True)
        if r.ok:
            # Let urllib3 gunzip the stream so we can iterate plain CDX lines
            r.raw.decode_content = True
            for line in r.raw:
                yield line
        else:
            # A failed chunk was previously skipped silently, losing data
            # without notice — at least report it so the run can be resumed.
            errprint(f"Failed [{r.status_code}]: {pageurl}")
        sleep(randint(8, 11))  # be polite to IA (ht Lesley)
def write_cdx(urir, cdxapi, params, outfile):
    """Stream CDX records for *urir* from *cdxapi* into *outfile*.

    :param urir: original resource URI to query captures for
    :param cdxapi: base URL of the CDX API endpoint
    :param params: pre-built query-string fragment (e.g. matchType/filter)
    :param outfile: path of the text file to write CDX lines to
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url)
    try:
        # Context manager guarantees the file is closed even if the
        # download stream raises partway through (the old code leaked it).
        with open(outfile, "w") as f:
            for line in input_stream:
                f.write(line.decode())
                f.flush()  # keep partial results on disk during long runs
    finally:
        # Always close the generator so its HTTP response is released;
        # generator.close() is safe to call and replaces the bare except.
        input_stream.close()
# MAIN — guard so importing this module does not trigger a network download
if __name__ == "__main__":
    cdxapi = "https://web.archive.org/cdx/search"
    params = "matchType=exact" + OTHER_PARAMS
    outfile = "cnn-" + FROM + "-" + TO + "-day.cdx"
    write_cdx(URIR, cdxapi, params, outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Politeness to avoid being blocked:
from random import randint and from time import sleep
last line of get_stream_from_api (inside for loop): sleep(randint(8,11))
Can also get more pages at once: (suggestion from Sawood)
for page in range(startPage, pages, 5):
pageurl = f"{url}&page={page}&pageSize=5"