Skip to content

Instantly share code, notes, and snippets.

@lesleyodu
Created April 29, 2025 14:44
Show Gist options
  • Save lesleyodu/e5efc43646df58db1a93c0ddbe7f8d8f to your computer and use it in GitHub Desktop.
Save lesleyodu/e5efc43646df58db1a93c0ddbe7f8d8f to your computer and use it in GitHub Desktop.
grab-cdx2
# grab-cdx.py MCW
from requests import Session
from rich.console import Console
from urllib.parse import urlencode
from random import randint
from time import sleep
# Crawl target and the short label used when naming the output file.
URIR = "https://www.samhsa.gov"
NICKNAME = "samhsaall"

# Values sent as the `from` and `to` query parameters.
FROM = "1996"
TO = "2025"

# Extra query parameters appended to every CDX request.
# To keep only one entry per day with 200 OK, also append
# "&collapse=timestamp:8&filter=statuscode:200".
OTHER_PARAMS = f"&from={FROM}&to={TO}"

# One HTTP session reused for every request the script makes.
REQSESSION = Session()

# Progress/diagnostic printer: writes to stderr in red, no auto-highlighting.
errprint = Console(stderr=True, style="red", highlight=False).print
# Adapted from cdx-summary:
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url, startPage=0):
    """Yield raw CDX lines (bytes) from a paginated CDX API query.

    Args:
        url: Full CDX query URL including its query string; pagination
            parameters are appended with ``&``.
        startPage: First page index to request (useful for resuming).

    Yields:
        Each line of every successfully fetched page, as bytes.

    Raises:
        requests.HTTPError: If the page-count request fails.
        ValueError: If the page-count response is not an integer.

    NOTE(review): the page count is requested without ``pageSize`` while the
    pages themselves are fetched with ``pageSize=5`` and a step of 5. Confirm
    against the CDX server pagination docs that this covers every record.
    """
    with REQSESSION.get(f"{url}&showNumPages=true") as count_response:
        # Fail with the HTTP error instead of an opaque int() ValueError.
        count_response.raise_for_status()
        pages = int(count_response.text)

    for page in range(startPage, pages, 5):
        pageurl = f"{url}&page={page}&pageSize=5"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        # The context manager releases the streamed connection even when the
        # response is not OK or the consumer stops iterating early.
        with REQSESSION.get(pageurl, stream=True) as r:
            if r.ok:
                # Have urllib3 undo any Content-Encoding (e.g. gzip).
                r.raw.decode_content = True
                yield from r.raw
            else:
                errprint(f"Skipping page {page}: HTTP status {r.status_code}")
        # Be polite to the server: pause a random 8 to 11 seconds between pages.
        sleep(randint(8, 11))
def write_cdx(urir, cdxapi, params, outfile, startPage=0):
    """Download CDX records for *urir* and write them to *outfile*.

    Args:
        urir: URL to look up; it is URL-encoded into the ``url`` parameter.
        cdxapi: Base URL of the CDX search endpoint.
        params: Pre-built query string placed before the ``url`` parameter.
        outfile: Path of the file to create; any existing content is
            overwritten.
        startPage: First page index to download, passed through to
            ``get_stream_from_api``.
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url, startPage)
    try:
        # Explicit UTF-8 keeps the output independent of the locale encoding
        # and symmetric with the decode below.
        with open(outfile, "w", encoding="utf-8") as f:
            for line in input_stream:
                f.write(line.decode("utf-8"))
                # Flush per line so partial results survive an interrupted
                # (long-running, rate-limited) download.
                f.flush()
    finally:
        # Always finalize the generator so its open HTTP response is released.
        input_stream.close()
# MAIN
# Query the CDX search endpoint for the whole URIR domain, starting at page
# index `start`, and write the records to a file named after the nickname,
# the from/to values and the starting page.
cdxapi = "https://web.archive.org/cdx/search"
params = f"matchType=domain{OTHER_PARAMS}"
start = 340
outfile = f"{NICKNAME}-{FROM}-{TO}-day-{start}.cdx"
write_cdx(URIR, cdxapi, params, outfile, startPage=start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment