Last active
March 19, 2025 19:58
-
-
Save edsu/89bd2844b9d3d4536e68956b3a16eaef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# This program reads WARC or WACZ data looking for Browsertrix text records
# and writes them out as files, using the archived URL as the path.
# | |
# You can run it right here from Gist using pipx: | |
# | |
# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.warc.gz file2.warc.gz | |
# | |
# If you give it a WACZ file it will read any WARC files contained in the WACZ: | |
# | |
# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.wacz file2.wacz | |
# | |
# /// script | |
# dependencies = ["warcio"] | |
# /// | |
import pathlib | |
import re | |
import sys | |
import urllib.parse | |
import zipfile | |
from warcio.archiveiterator import ArchiveIterator | |
def main():
    """Extract text records from every file named on the command line.

    Arguments ending in ``.warc`` or ``.warc.gz`` are opened and passed to
    ``process``. Arguments ending in ``.wacz`` are expanded with
    ``get_warc_streams`` and each contained WARC stream is passed to
    ``process``. Any other argument is reported on stderr and skipped.

    Exits with a usage message when no file arguments are given.
    """
    input_files = sys.argv[1:]
    if not input_files:
        sys.exit("usage: text_records file1.warc.gz file2.warc.gz file3.wacz")

    for input_file in input_files:
        if input_file.endswith((".warc", ".warc.gz")):
            print(f"processing WARC file {input_file}")
            # Close the file once it has been read instead of leaking the handle.
            with open(input_file, "rb") as warc_stream:
                process(warc_stream)
        elif input_file.endswith(".wacz"):
            for warc_stream in get_warc_streams(input_file):
                # Closing here is harmless if the stream was already closed.
                with warc_stream:
                    process(warc_stream)
        else:
            # Tell the user rather than silently ignoring the argument.
            print(
                f"skipping {input_file}: not a .warc, .warc.gz or .wacz file",
                file=sys.stderr,
            )
def process(warc_stream):
    """Write each ``urn:text:`` resource record in a WARC stream to a file.

    For every record of type ``resource`` whose ``WARC-Target-URI`` starts
    with ``urn:text:``, the remainder of the URI is parsed as a URL and the
    record payload is decoded as UTF-8 and written to
    ``<netloc><path>-<record id>.txt`` relative to the current directory.
    Parent directories are created as needed.

    Args:
        warc_stream: A binary file-like object that ``ArchiveIterator`` can
            read WARC records from.

    Raises:
        UnicodeDecodeError: If a matching record's payload is not valid UTF-8.
    """
    text_prefix = "urn:text:"

    for record in ArchiveIterator(warc_stream):
        if record.rec_type != "resource":
            continue

        target_uri = record.rec_headers.get_header("WARC-Target-URI")
        # The header may be absent, and the prefix must be at the start of the
        # URI because it is sliced off below.
        if not target_uri or not target_uri.startswith(text_prefix):
            continue

        # Only look at the record ID once we know the record will be written.
        raw_id = record.rec_headers.get_header("WARC-Record-ID") or ""
        id_match = re.search(r"<urn:uuid:(.+)>", raw_id)
        if id_match:
            record_id = id_match.group(1)
        else:
            # Not a urn:uuid ID: fall back to a filename-safe form of the raw
            # header value instead of crashing on ``None.group``.
            record_id = re.sub(r"\W+", "_", raw_id).strip("_")

        # Slice the prefix off. str.lstrip() treats its argument as a set of
        # characters, so it could also eat leading characters of the URL that
        # happen to be in "urn:text:".
        url = urllib.parse.urlparse(target_uri[len(text_prefix):])

        # The URL comes from archive data, so drop empty, "." and ".."
        # segments: the output must stay relative to the working directory.
        raw_path = url.netloc + url.path + f"-{record_id}.txt"
        safe_parts = [
            part for part in raw_path.split("/") if part not in ("", ".", "..")
        ]
        path = pathlib.Path(*safe_parts)

        path.parent.mkdir(parents=True, exist_ok=True)
        print(f"writing {path}")
        text = record.content_stream().read().decode("utf-8")
        # write_text closes the file and pins the encoding, so output does not
        # depend on the platform default.
        path.write_text(text, encoding="utf-8")
def get_warc_streams(wacz_file_or_stream, original_wacz=None):
    """
    Get file streams for WARCs contained in a WACZ file. If the WACZ file
    contains other WACZ files those will be processed too.

    This is a generator. Each yielded stream is closed when the generator is
    advanced, so read a stream fully before requesting the next one. The
    WACZ itself is closed when the generator finishes.

    Args:
        wacz_file_or_stream: Path to a WACZ file, or a seekable binary stream
            containing one.
        original_wacz: Label of the outermost WACZ, used only in progress
            messages. Callers normally leave this unset; it is passed along
            when recursing into nested WACZ files.

    Yields:
        Binary file-like objects, one per member whose name ends in
        ``warc.gz`` or ``warc``.
    """
    # Always report the outermost WACZ, however deeply the member is nested.
    label = original_wacz or wacz_file_or_stream

    # ZipFile accepts both a path and a file object. The context manager
    # closes the archive (and the file it opened for a path) when done.
    with zipfile.ZipFile(wacz_file_or_stream) as zip_file:
        for filename in zip_file.namelist():
            # Accept uncompressed WARCs too, not only gzipped ones.
            if filename.endswith(("warc.gz", "warc")):
                print(f"processing {filename} from WACZ {label}")
                with zip_file.open(filename, "r") as warc_stream:
                    yield warc_stream
            elif filename.endswith("wacz"):
                with zip_file.open(filename, "r") as nested_wacz:
                    yield from get_warc_streams(nested_wacz, original_wacz=label)
# Run the extractor only when this file is executed as a script (for example
# via ``pipx run``), not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment