edsu · March 19, 2025 19:58
diff --git a/warc_text.py b/warc_text.py
 #!/usr/bin/env python3

 # This program reads WARC or WACZ data looking for Browsertrix text records
 # and writes each one out as a file, using the archived URL as the path.
 #
 # You can run it right here from Gist using pipx:
 #
 #     pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.warc.gz file2.warc.gz
 #
 # If you give it a WACZ file it will read any WARC files contained in the WACZ:
 #
 #     pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.wacz file2.wacz
 #
 # /// script
 # dependencies = ["warcio"]
 # ///

 import pathlib
 import re
 import sys
 import urllib.parse
 import zipfile

 from warcio.archiveiterator import ArchiveIterator


 def main():
     """
     Command line entry point.

     Every argument is treated as an input file. WARC files (.warc or
     .warc.gz) are processed directly; for WACZ files (.wacz) every WARC
     they contain is processed. Exits with a usage message when no input
     files are given.
     """
     input_files = sys.argv[1:]
     if not input_files:
         sys.exit("usage: text_records file1.warc.gz file2.warc.gz file3.wacz")

     for input_file in input_files:
         if input_file.endswith((".warc", ".warc.gz")):
             print(f"processing WARC file {input_file}")
             # Close the file as soon as its records have been extracted.
             with open(input_file, "rb") as warc_stream:
                 process(warc_stream)
         elif input_file.endswith(".wacz"):
             for warc_stream in get_warc_streams(input_file):
                 # Each WACZ member stream is ours to close once processed.
                 with warc_stream:
                     process(warc_stream)
         else:
             # Tell the user instead of silently ignoring the argument.
             print(
                 f"skipping {input_file}: not a .warc, .warc.gz or .wacz file",
                 file=sys.stderr,
             )


 def process(warc_stream):
     """
     Extract Browsertrix text records from a WARC stream and write each one
     to a file.

     Only "resource" records whose WARC-Target-URI starts with "urn:text:"
     are used. The output path is built from the host and path of the
     archived URL plus the UUID from WARC-Record-ID and a .txt extension,
     relative to the current directory; missing parent directories are
     created.

     warc_stream is a binary file-like object that ArchiveIterator can read.
     """
     text_prefix = "urn:text:"

     for record in ArchiveIterator(warc_stream):
         if record.rec_type != "resource":
             continue

         target_uri = record.rec_headers.get_header("WARC-Target-URI")
         if not target_uri or not target_uri.startswith(text_prefix):
             continue

         # Only parse the record id for records we are going to write, and
         # do not crash when the header is missing or has another shape.
         warc_record_id = record.rec_headers.get_header("WARC-Record-ID")
         id_match = re.search(r"<urn:uuid:(.+)>", warc_record_id or "")
         if id_match is None:
             print(
                 f"skipping {target_uri}: unexpected WARC-Record-ID {warc_record_id!r}",
                 file=sys.stderr,
             )
             continue

         # Slice the prefix off: str.lstrip() removes a *set of characters*
         # and could eat the start of the archived URL as well.
         path = _output_path(target_uri[len(text_prefix):], id_match.group(1))
         path.parent.mkdir(parents=True, exist_ok=True)
         print(f"writing {path}")
         text = record.content_stream().read().decode("utf-8")
         # write_text closes the file; the explicit encoding keeps the output
         # UTF-8 regardless of the platform default.
         path.write_text(text, encoding="utf-8")


 def _output_path(archived_url, record_id):
     """Return the relative file path used to store the text of archived_url."""
     url = urllib.parse.urlparse(archived_url)
     raw = pathlib.PurePosixPath(url.netloc + url.path + f"-{record_id}.txt")
     # The URL comes from the archive, so drop the root and any ".." parts to
     # keep the output underneath the current directory.
     safe_parts = [part for part in raw.parts if part.strip("/") and part != ".."]
     return pathlib.Path(*safe_parts)


 def get_warc_streams(wacz_file_or_stream, original_wacz=None):
     """
     Get file streams for WARCs contained in a WACZ file. If the WACZ file
     contains other WACZ files those will be processed too.

     wacz_file_or_stream is either the path of a WACZ file or a seekable
     binary stream containing one. original_wacz is the label of the
     outermost WACZ, used in progress messages while recursing; callers
     normally leave it unset.

     This is a generator yielding one readable binary stream per WARC member
     (names ending in .warc or .warc.gz). Read each stream before asking for
     the next one: the archives are closed once the generator is exhausted
     or closed.
     """
     # Always report the outermost WACZ, however deeply nested we are.
     wacz_label = original_wacz or wacz_file_or_stream

     # ZipFile accepts a path or a file object; handing it the path directly
     # lets it own, and eventually close, the underlying file handle.
     with zipfile.ZipFile(wacz_file_or_stream) as zip_file:
         for filename in zip_file.namelist():
             if filename.endswith((".warc", ".warc.gz")):
                 print(f"processing {filename} from WACZ {wacz_label}")
                 yield zip_file.open(filename, "r")
             elif filename.endswith(".wacz"):
                 with zip_file.open(filename, "r") as nested_wacz:
                     yield from get_warc_streams(nested_wacz, original_wacz=wacz_label)


 # Run the command line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# This program reads WARC or WACZ data looking for Browsertrix text records
	# and writes each one out as a file, using the archived URL as the path.
	#
	# You can run it right here from Gist using pipx:
	#
	# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.warc.gz file2.warc.gz
	#
	# If you give it a WACZ file it will read any WARC files contained in the WACZ:
	#
	# pipx run https://gist.githubusercontent.com/edsu/89bd2844b9d3d4536e68956b3a16eaef/raw/warc_text.py file1.wacz file2.wacz
	#
	# /// script
	# dependencies = ["warcio"]
	# ///

	import pathlib
	import re
	import sys
	import urllib.parse
	import zipfile

	from warcio.archiveiterator import ArchiveIterator


	def main():
	    """
	    Command line entry point.

	    Every argument is treated as an input file. WARC files (.warc or
	    .warc.gz) are processed directly; for WACZ files (.wacz) every WARC
	    they contain is processed. Exits with a usage message when no input
	    files are given.
	    """
	    input_files = sys.argv[1:]
	    if not input_files:
	        sys.exit("usage: text_records file1.warc.gz file2.warc.gz file3.wacz")

	    for input_file in input_files:
	        if input_file.endswith((".warc", ".warc.gz")):
	            print(f"processing WARC file {input_file}")
	            # Close the file as soon as its records have been extracted.
	            with open(input_file, "rb") as warc_stream:
	                process(warc_stream)
	        elif input_file.endswith(".wacz"):
	            for warc_stream in get_warc_streams(input_file):
	                # Each WACZ member stream is ours to close once processed.
	                with warc_stream:
	                    process(warc_stream)
	        else:
	            # Tell the user instead of silently ignoring the argument.
	            print(
	                f"skipping {input_file}: not a .warc, .warc.gz or .wacz file",
	                file=sys.stderr,
	            )


	def process(warc_stream):
	    """
	    Extract Browsertrix text records from a WARC stream and write each one
	    to a file.

	    Only "resource" records whose WARC-Target-URI starts with "urn:text:"
	    are used. The output path is built from the host and path of the
	    archived URL plus the UUID from WARC-Record-ID and a .txt extension,
	    relative to the current directory; missing parent directories are
	    created.

	    warc_stream is a binary file-like object that ArchiveIterator can read.
	    """
	    text_prefix = "urn:text:"

	    for record in ArchiveIterator(warc_stream):
	        if record.rec_type != "resource":
	            continue

	        target_uri = record.rec_headers.get_header("WARC-Target-URI")
	        if not target_uri or not target_uri.startswith(text_prefix):
	            continue

	        # Only parse the record id for records we are going to write, and
	        # do not crash when the header is missing or has another shape.
	        warc_record_id = record.rec_headers.get_header("WARC-Record-ID")
	        id_match = re.search(r"<urn:uuid:(.+)>", warc_record_id or "")
	        if id_match is None:
	            print(
	                f"skipping {target_uri}: unexpected WARC-Record-ID {warc_record_id!r}",
	                file=sys.stderr,
	            )
	            continue

	        # Slice the prefix off: str.lstrip() removes a *set of characters*
	        # and could eat the start of the archived URL as well.
	        path = _output_path(target_uri[len(text_prefix):], id_match.group(1))
	        path.parent.mkdir(parents=True, exist_ok=True)
	        print(f"writing {path}")
	        text = record.content_stream().read().decode("utf-8")
	        # write_text closes the file; the explicit encoding keeps the output
	        # UTF-8 regardless of the platform default.
	        path.write_text(text, encoding="utf-8")


	def _output_path(archived_url, record_id):
	    """Return the relative file path used to store the text of archived_url."""
	    url = urllib.parse.urlparse(archived_url)
	    raw = pathlib.PurePosixPath(url.netloc + url.path + f"-{record_id}.txt")
	    # The URL comes from the archive, so drop the root and any ".." parts to
	    # keep the output underneath the current directory.
	    safe_parts = [part for part in raw.parts if part.strip("/") and part != ".."]
	    return pathlib.Path(*safe_parts)


	def get_warc_streams(wacz_file_or_stream, original_wacz=None):
	    """
	    Get file streams for WARCs contained in a WACZ file. If the WACZ file
	    contains other WACZ files those will be processed too.

	    wacz_file_or_stream is either the path of a WACZ file or a seekable
	    binary stream containing one. original_wacz is the label of the
	    outermost WACZ, used in progress messages while recursing; callers
	    normally leave it unset.

	    This is a generator yielding one readable binary stream per WARC member
	    (names ending in .warc or .warc.gz). Read each stream before asking for
	    the next one: the archives are closed once the generator is exhausted
	    or closed.
	    """
	    # Always report the outermost WACZ, however deeply nested we are.
	    wacz_label = original_wacz or wacz_file_or_stream

	    # ZipFile accepts a path or a file object; handing it the path directly
	    # lets it own, and eventually close, the underlying file handle.
	    with zipfile.ZipFile(wacz_file_or_stream) as zip_file:
	        for filename in zip_file.namelist():
	            if filename.endswith((".warc", ".warc.gz")):
	                print(f"processing {filename} from WACZ {wacz_label}")
	                yield zip_file.open(filename, "r")
	            elif filename.endswith(".wacz"):
	                with zip_file.open(filename, "r") as nested_wacz:
	                    yield from get_warc_streams(nested_wacz, original_wacz=wacz_label)


	# Run the command line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()