Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active March 19, 2025 19:58
Show Gist options
  • Save edsu/89bd2844b9d3d4536e68956b3a16eaef to your computer and use it in GitHub Desktop.
Save edsu/89bd2844b9d3d4536e68956b3a16eaef to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# This program reads WARC or WACZ files, looks for Browsertrix text records,
# and writes each one out as a file, using the archived URL as the path.
#
# You can run it right here from Gist using pipx:
#
# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.warc.gz file2.warc.gz
#
# If you give it a WACZ file it will read any WARC files contained in the WACZ:
#
# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.wacz file2.wacz
#
# /// script
# dependencies = ["warcio"]
# ///
import pathlib
import re
import sys
import urllib.parse
import zipfile
from warcio.archiveiterator import ArchiveIterator
def main():
    """
    Command-line entry point.

    Every argument after the program name is treated as an input file.
    Files ending in .warc or .warc.gz are handed to process() directly;
    files ending in .wacz are opened with get_warc_streams() and each WARC
    inside is handed to process(). Anything else is reported on stderr and
    skipped. Exits with a usage message when no files are given.
    """
    input_files = sys.argv[1:]
    if not input_files:
        sys.exit("usage: text_records file1.warc.gz file2.warc.gz file3.wacz")

    for input_file in input_files:
        if input_file.endswith((".warc", ".warc.gz")):
            print(f"processing WARC file {input_file}")
            # Close the file as soon as its records have been extracted.
            with open(input_file, "rb") as warc_stream:
                process(warc_stream)
        elif input_file.endswith(".wacz"):
            for warc_stream in get_warc_streams(input_file):
                # Each stream is fully consumed here, so release it before
                # asking the generator for the next one.
                with warc_stream:
                    process(warc_stream)
        else:
            # Make a mistyped or unsupported argument visible instead of
            # silently doing nothing.
            print(
                f"skipping {input_file}: expected a .warc, .warc.gz or .wacz file",
                file=sys.stderr,
            )
def process(warc_stream):
    """
    Write each Browsertrix text record found in a WARC stream to its own file.

    Only "resource" records whose WARC-Target-URI starts with "urn:text:"
    are used. The record body is decoded as UTF-8 and written to a path,
    relative to the current working directory, made of the archived URL's
    host and path followed by "-<record uuid>.txt". Parent directories are
    created as needed. Records that cannot be mapped to a safe relative
    path, or whose WARC-Record-ID is not a urn:uuid, are reported on stderr
    and skipped.

    warc_stream: file-like object with WARC data, passed to ArchiveIterator.
    """
    prefix = "urn:text:"

    for record in ArchiveIterator(warc_stream):
        if record.rec_type != "resource":
            continue

        target_uri = record.rec_headers.get_header("WARC-Target-URI")
        # Require the marker at the start; a substring test would also accept
        # URIs that merely contain "urn:text:" somewhere, and would raise on
        # a missing header.
        if not target_uri or not target_uri.startswith(prefix):
            continue

        raw_record_id = record.rec_headers.get_header("WARC-Record-ID") or ""
        id_match = re.search(r"<urn:uuid:(.+)>", raw_record_id)
        if id_match is None:
            print(
                f"skipping {target_uri}: unexpected WARC-Record-ID {raw_record_id!r}",
                file=sys.stderr,
            )
            continue
        record_id = id_match.group(1)

        # Slice the prefix off. str.lstrip() removes a *set of characters*,
        # so it could also eat leading characters of the archived URL.
        url = urllib.parse.urlparse(target_uri[len(prefix):])

        # The URL comes from archived data: keep the output inside the
        # current directory even when the host is empty (path would be
        # absolute) or the path contains ".." segments.
        relative = (url.netloc + url.path).lstrip("/")
        path = pathlib.Path(relative + f"-{record_id}.txt")
        if path.is_absolute() or ".." in path.parts:
            print(f"skipping {target_uri}: unsafe output path {path}", file=sys.stderr)
            continue

        path.parent.mkdir(parents=True, exist_ok=True)
        print(f"writing {path}")
        text = record.content_stream().read().decode("utf-8")
        # write_text closes the file, and the explicit encoding avoids
        # failures on platforms whose default encoding is not UTF-8.
        path.write_text(text, encoding="utf-8")
def get_warc_streams(wacz_file_or_stream, original_wacz=None):
    """
    Get file streams for WARCs contained in a WACZ file. If the WACZ file
    contains other WACZ files those will be processed too.

    wacz_file_or_stream: path to a WACZ (zip) file, or a seekable binary
        stream containing one.
    original_wacz: name of the outermost WACZ, used in progress messages
        while recursing into nested WACZ files. Callers normally omit it.

    Yields one readable binary stream per .warc / .warc.gz member, in the
    order the members are listed in the archive. Each stream is closed when
    the generator advances, so consume it before requesting the next one.
    The archives themselves are closed when the generator finishes.
    """
    # Keep reporting the outermost WACZ at every nesting depth instead of
    # replacing it with an intermediate stream object.
    wacz_label = original_wacz or wacz_file_or_stream
    warc_suffixes = (".warc", ".warc.gz")

    # ZipFile accepts a path or a file object; given a path it owns the
    # underlying file and closes it for us.
    with zipfile.ZipFile(wacz_file_or_stream) as zip_file:
        for filename in zip_file.namelist():
            if filename.endswith(warc_suffixes):
                print(f"processing {filename} from WACZ {wacz_label}")
                with zip_file.open(filename, "r") as warc_stream:
                    yield warc_stream
            elif filename.endswith(".wacz"):
                with zip_file.open(filename, "r") as nested_wacz:
                    yield from get_warc_streams(nested_wacz, original_wacz=wacz_label)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment