basilesimon · August 28, 2025 12:09
diff --git a/README.md b/README.md
diff --git a/extract_webcam.py b/extract_webcam.py
 #!/usr/bin/env python3

 import zipfile
 import os
 from pathlib import Path
 from warcio.archiveiterator import ArchiveIterator
 from urllib.parse import urlparse

 def extract_images_from_warc(warc_stream, output_dir, wacz_name):
    """Extract webcam ``OSH*.jpg`` images from a WARC stream.

    Walks the records of ``warc_stream`` and, for every ``response`` record
    whose target URL path contains ``/webcam/OSH`` and ends in ``.jpg``,
    writes the record payload to ``output_dir`` as
    ``{wacz_name}_{image_name}``.

    The filter is applied to the URL *path* (query string and fragment are
    ignored), so a cache-busting URL such as ``OSH1.jpg?t=123`` is still
    picked up and the filter agrees with how the output name is derived.

    Args:
        warc_stream: Binary file-like object handed to ``ArchiveIterator``.
        output_dir: Directory the images are written into; it is not
            created here.
        wacz_name: Prefix used for every output filename.

    Returns:
        list[str]: Paths of the files written, one entry per matching
        record, in record order.

    Note:
        Records that share an image name (and ``wacz_name``) map to the
        same output file, so a later record overwrites an earlier one.
    """
    images = []
    output_dir = Path(output_dir)

    for record in ArchiveIterator(warc_stream):
        if record.rec_type != 'response':
            continue

        url = record.rec_headers.get_header('WARC-Target-URI', '')
        url_path = urlparse(url).path
        if '/webcam/OSH' not in url_path or not url_path.endswith('.jpg'):
            continue

        content = record.content_stream().read()
        # Keep only payloads that start with the JPEG magic bytes; empty
        # bodies and non-JPEG payloads under an image URL are skipped.
        if not (content and content.startswith(b'\xff\xd8')):
            continue

        output_path = output_dir / f"{wacz_name}_{Path(url_path).name}"
        output_path.write_bytes(content)
        images.append(str(output_path))

    return images

 def process_wacz_file(wacz_path, output_dir):
    """Extract the webcam images contained in one WACZ archive.

    The WACZ is opened as a ZIP file and every member under ``archive/``
    whose name contains ``.warc`` is streamed through
    ``extract_images_from_warc``. One summary line is printed per archive.

    Args:
        wacz_path: Path to the ``.wacz`` file.
        output_dir: Directory passed through to the image extractor.

    Returns:
        list: Paths of the extracted images, or an empty list if any
        exception was raised while processing the archive.
    """
    wacz_name = Path(wacz_path).stem
    images = []

    try:
        with zipfile.ZipFile(wacz_path, 'r') as archive:
            for member in archive.namelist():
                is_warc = member.startswith('archive/') and '.warc' in member
                if not is_warc:
                    continue
                with archive.open(member) as warc_stream:
                    images += extract_images_from_warc(warc_stream, output_dir, wacz_name)

        print(f"✓ {wacz_name}: extracted {len(images)} images")
        return images

    except Exception as e:
        # Best effort: report the failure for this archive and hand back
        # an empty result instead of raising.
        print(f"✗ {wacz_name}: {e}")
        return []

 def main():
    """Extract webcam images from every ``data/*.wacz`` archive.

    Creates the ``extracted_webcam`` directory if needed, runs
    ``process_wacz_file`` on each archive found, and prints the number of
    archives found and the total number of images extracted.
    """
    output_dir = Path("extracted_webcam")
    output_dir.mkdir(exist_ok=True)

    wacz_files = list(Path("data").glob("*.wacz"))
    print(f"Found {len(wacz_files)} WACZ files")

    all_images = [
        image
        for wacz_file in wacz_files
        for image in process_wacz_file(wacz_file, output_dir)
    ]

    print(f"\nExtracted {len(all_images)} total images to {output_dir}")

 # Script entry point: run the extraction only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import zipfile
	import os
	from pathlib import Path
	from warcio.archiveiterator import ArchiveIterator
	from urllib.parse import urlparse

	def extract_images_from_warc(warc_stream, output_dir, wacz_name):
	    """Extract webcam ``OSH*.jpg`` images from a WARC stream.

	    For every ``response`` record whose target URL path contains
	    ``/webcam/OSH`` and ends in ``.jpg``, the payload is written to
	    ``output_dir`` as ``{wacz_name}_{image_name}``. The URL path is used
	    for both the filter and the filename, so a query string or fragment
	    on the URL does not prevent a match.

	    Args:
	        warc_stream: Binary file-like object handed to ``ArchiveIterator``.
	        output_dir: Directory the images are written into; it is not
	            created here.
	        wacz_name: Prefix used for every output filename.

	    Returns:
	        list[str]: Paths of the files written, one entry per matching
	        record, in record order.

	    Note:
	        Records that share an image name (and ``wacz_name``) map to the
	        same output file, so a later record overwrites an earlier one.
	    """
	    images = []

	    for record in ArchiveIterator(warc_stream):
	        if record.rec_type != 'response':
	            continue

	        url = record.rec_headers.get_header('WARC-Target-URI', '')
	        url_path = urlparse(url).path
	        if '/webcam/OSH' not in url_path or not url_path.endswith('.jpg'):
	            continue

	        content = record.content_stream().read()
	        if content and content.startswith(b'\xff\xd8'):  # JPEG magic bytes
	            image_name = Path(url_path).name
	            filename = f"{wacz_name}_{image_name}"
	            output_path = Path(output_dir) / filename
	            output_path.write_bytes(content)
	            images.append(str(output_path))

	    return images

	def process_wacz_file(wacz_path, output_dir):
	    """Process a single WACZ file and extract webcam images.

	    Opens the WACZ as a ZIP file, streams every member under ``archive/``
	    whose name contains ``.warc`` through ``extract_images_from_warc``,
	    and prints a one-line summary.

	    Args:
	        wacz_path: Path to the ``.wacz`` file.
	        output_dir: Directory passed through to the image extractor.

	    Returns:
	        list: Paths of the extracted images, or an empty list if any
	        exception was raised while processing the archive.
	    """
	    wacz_name = Path(wacz_path).stem
	    images = []

	    try:
	        with zipfile.ZipFile(wacz_path, 'r') as zf:
	            warc_files = [
	                name for name in zf.namelist()
	                if name.startswith('archive/') and '.warc' in name
	            ]

	            for warc_name in warc_files:
	                with zf.open(warc_name) as warc_stream:
	                    images.extend(extract_images_from_warc(warc_stream, output_dir, wacz_name))

	        print(f"✓ {wacz_name}: extracted {len(images)} images")
	        return images

	    except Exception as e:
	        # Best effort: report the failure for this archive and hand back
	        # an empty result instead of raising.
	        print(f"✗ {wacz_name}: {e}")
	        return []

	def main():
	    """Extract webcam images from every ``data/*.wacz`` archive.

	    Creates the ``extracted_webcam`` directory if needed, runs
	    ``process_wacz_file`` on each archive found, and prints the number of
	    archives found and the total number of images extracted.
	    """
	    data_dir = Path("data")
	    output_dir = Path("extracted_webcam")
	    output_dir.mkdir(exist_ok=True)

	    wacz_files = list(data_dir.glob("*.wacz"))
	    print(f"Found {len(wacz_files)} WACZ files")

	    all_images = []
	    for wacz_file in wacz_files:
	        all_images.extend(process_wacz_file(wacz_file, output_dir))

	    print(f"\nExtracted {len(all_images)} total images to {output_dir}")

	# Script entry point: run the extraction only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()