edsu · September 2, 2023 18:43
diff --git a/extract_images.py b/extract_images.py
 #!/usr/bin/env python3

 import sys
 import pathlib

 from urllib.parse import urlparse
 from warcio.archiveiterator import ArchiveIterator

 def save(url, stream):
     """Write the bytes read from *stream* to a local file named after *url*.

     The file is placed at ``<netloc>/<path>`` relative to the current working
     directory; the query string and fragment are not part of the file name.
     Missing parent directories are created, and the resulting path is printed.

     Args:
         url: URL string the payload was fetched from.
         stream: object with a ``read()`` method returning the payload bytes.
     """
     uri = urlparse(url)

     # Resolve '.' and '..' segments up front so that a URL path such as
     # '/a/../../x.png' cannot climb out of the per-host directory.
     segments = []
     for segment in uri.path.split('/'):
         if segment == '..':
             if segments:
                 segments.pop()
         elif segment and segment != '.':
             segments.append(segment)

     path = pathlib.Path(uri.netloc, *segments)
     path.parent.mkdir(parents=True, exist_ok=True)
     # Use a context manager so the handle is flushed and closed right away
     # instead of whenever the file object happens to be garbage-collected.
     with path.open('wb') as out:
         out.write(stream.read())
     print(path)

 def extract_images(warc_file):
     """Save every image response found in a WARC file to disk.

     Iterates the records of *warc_file* and, for each ``response`` record
     whose HTTP ``Content-Type`` header mentions ``image``, hands the target
     URL and the record's content stream to ``save``.

     Args:
         warc_file: path of the WARC file to read.
     """
     with open(warc_file, 'rb') as stream:
         for record in ArchiveIterator(stream):
             if record.rec_type != 'response':
                 continue

             url = record.rec_headers.get_header('WARC-Target-URI')
             http_headers = record.http_headers
             # Skip records that carry no target URI or no parsed HTTP
             # headers; there is nothing to name or classify them by.
             if not url or http_headers is None:
                 continue

             # A missing Content-Type header comes back as None, which would
             # make the substring test below raise TypeError.
             content_type = http_headers.get_header('Content-Type') or ''
             # Media types are case-insensitive, so compare in lower case.
             if 'image' in content_type.lower():
                 save(url, record.content_stream())

 if __name__ == "__main__":
     # Script entry point: expects exactly one argument, the WARC file path.
     if len(sys.argv) == 2:
         extract_images(sys.argv[1])
     else:
         sys.exit('usage: extract_images.py <warc_file>')
	#!/usr/bin/env python3

	import sys
	import pathlib

	from urllib.parse import urlparse
	from warcio.archiveiterator import ArchiveIterator

	def save(url, stream):
	    """Write the bytes read from *stream* to a local file named after *url*.

	    The file is placed at ``<netloc>/<path>`` relative to the current working
	    directory; the query string and fragment are not part of the file name.
	    Missing parent directories are created, and the resulting path is printed.

	    Args:
	        url: URL string the payload was fetched from.
	        stream: object with a ``read()`` method returning the payload bytes.
	    """
	    uri = urlparse(url)

	    # Resolve '.' and '..' segments up front so that a URL path such as
	    # '/a/../../x.png' cannot climb out of the per-host directory.
	    segments = []
	    for segment in uri.path.split('/'):
	        if segment == '..':
	            if segments:
	                segments.pop()
	        elif segment and segment != '.':
	            segments.append(segment)

	    path = pathlib.Path(uri.netloc, *segments)
	    path.parent.mkdir(parents=True, exist_ok=True)
	    # Use a context manager so the handle is flushed and closed right away
	    # instead of whenever the file object happens to be garbage-collected.
	    with path.open('wb') as out:
	        out.write(stream.read())
	    print(path)

	def extract_images(warc_file):
	    """Save every image response found in a WARC file to disk.

	    Iterates the records of *warc_file* and, for each ``response`` record
	    whose HTTP ``Content-Type`` header mentions ``image``, hands the target
	    URL and the record's content stream to ``save``.

	    Args:
	        warc_file: path of the WARC file to read.
	    """
	    with open(warc_file, 'rb') as stream:
	        for record in ArchiveIterator(stream):
	            if record.rec_type != 'response':
	                continue

	            url = record.rec_headers.get_header('WARC-Target-URI')
	            http_headers = record.http_headers
	            # Skip records that carry no target URI or no parsed HTTP
	            # headers; there is nothing to name or classify them by.
	            if not url or http_headers is None:
	                continue

	            # A missing Content-Type header comes back as None, which would
	            # make the substring test below raise TypeError.
	            content_type = http_headers.get_header('Content-Type') or ''
	            # Media types are case-insensitive, so compare in lower case.
	            if 'image' in content_type.lower():
	                save(url, record.content_stream())

	if __name__ == "__main__":
	    # Script entry point: expects exactly one argument, the WARC file path.
	    if len(sys.argv) == 2:
	        extract_images(sys.argv[1])
	    else:
	        sys.exit('usage: extract_images.py <warc_file>')
No results found