Created
April 29, 2025 14:44
-
-
Save lesleyodu/e5efc43646df58db1a93c0ddbe7f8d8f to your computer and use it in GitHub Desktop.
grab-cdx2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# grab-cdx.py MCW
from contextlib import closing
from random import randint
from time import sleep
from urllib.parse import urlencode

from requests import Session
from rich.console import Console
# Site whose captures are looked up, and the label used to name the output file.
URIR = "https://www.samhsa.gov"
NICKNAME = "samhsaall"

# Bounds sent to the CDX API as its from/to query parameters.
FROM = "1996"
TO = "2025"

# Extra query-string parameters appended to every CDX request.
# To keep only one entry per day with status 200 OK, additionally append
# "&collapse=timestamp:8&filter=statuscode:200".
OTHER_PARAMS = f"&from={FROM}&to={TO}"

# Single HTTP session used for all requests in this script.
REQSESSION = Session()

# Printer for progress messages: writes to stderr in red, without
# automatic highlighting.
errprint = Console(stderr=True, style="red", highlight=False).print
# HELPFUL FUNCTION FROM CDXSUMMARY
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url, startPage=0):
    """Yield the raw response lines of a paginated CDX API query.

    The page count is fetched first (``showNumPages=true``); pages are then
    requested from ``startPage`` up to that count, advancing the page index
    by 5 and sending ``pageSize=5``, with a random pause after each request.

    Args:
        url: Query URL that already contains a query string; additional
            parameters are appended to it with ``&``.
        startPage: Page index of the first request (0 by default), which
            allows a caller to pick up from a later page.

    Yields:
        Every line (as ``bytes``) of each page whose response was OK.
        Pages with a non-OK response contribute no lines; they are
        reported on stderr instead of being skipped silently.

    Raises:
        ValueError: If the page-count response is not an integer.
    """
    # NOTE(review): the page index steps by 5 while pageSize=5 is also sent,
    # and the page count is requested without pageSize. Confirm against the
    # CDX server pagination docs that this neither skips nor repeats records.
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(startPage, pages, 5):
        pageurl = f"{url}&page={page}&pageSize=5"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        # The context manager closes the streamed response once the page has
        # been consumed, or when the generator is closed early.
        with REQSESSION.get(pageurl, stream=True) as r:
            if r.ok:
                # Have urllib3 undo any Content-Encoding while streaming.
                r.raw.decode_content = True
                yield from r.raw
            else:
                # Keep going (best effort), but make the gap visible.
                errprint(f"Skipping page {page}: HTTP status {r.status_code}")
        # Sleep a random number of seconds (between 8 and 11) between requests
        sleep(randint(8, 11))
def write_cdx(urir, cdxapi, params, outfile, startPage=0):
    """Download the CDX records for ``urir`` and write them to ``outfile``.

    Args:
        urir: URL to query; it is URL-encoded and sent as the ``url``
            query parameter.
        cdxapi: Base URL of the CDX search endpoint.
        params: Ready-made query string placed before the ``url`` parameter.
        outfile: Path of the file to write; opened in ``"w"`` mode, so an
            existing file is overwritten.
        startPage: Page index passed through to ``get_stream_from_api``.
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    # closing() finalizes the generator and the inner ``with`` closes the
    # file, even when a request or a write fails part-way through.
    with closing(get_stream_from_api(url, startPage)) as input_stream:
        # The lines are decoded as UTF-8 below, so write them back as UTF-8
        # rather than in whatever the platform's default encoding happens to be.
        with open(outfile, "w", encoding="utf-8") as f:
            for line in input_stream:
                f.write(line.decode())
                # Flush after every line so records already received reach
                # the file promptly instead of sitting in the buffer.
                f.flush()
# MAIN
# Page index handed to write_cdx as the first page to request; it is also
# embedded in the output file name.
start = 340
cdxapi = "https://web.archive.org/cdx/search"
params = f"matchType=domain{OTHER_PARAMS}"
outfile = f"{NICKNAME}-{FROM}-{TO}-day-{start}.cdx"
write_cdx(URIR, cdxapi, params, outfile, startPage=start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment