Skip to content

Instantly share code, notes, and snippets.

@basilesimon
Created August 28, 2025 12:09
Show Gist options
  • Save basilesimon/3231509bb59c5c8b6c0180f930bcf0e3 to your computer and use it in GitHub Desktop.
Save basilesimon/3231509bb59c5c8b6c0180f930bcf0e3 to your computer and use it in GitHub Desktop.
WACZ image finder

WACZ Webcam Image Extractor

A Python script to extract webcam images from WACZ (Web Archive Collection Zipped) files.

Purpose

Extracts Belarus customs webcam images (OSH1.jpg, OSH2.jpg, OSH3.jpg, OSH4.jpg) from archived web data collected between August 5-27, 2025.

Requirements

pip install warcio

Usage

python extract_webcam.py

What it does

  • Searches all .wacz files in the data/ directory
  • Extracts images matching pattern /webcam/OSH*.jpg
  • Saves images to extracted_webcam/ directory
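  • Names files as {wacz_name}_{image_name}, e.g. {wacz_name}_OSH1.jpg, where {wacz_name} is the WACZ filename without its extension (presumably the capture timestamp)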

Output

The script processes WACZ files and reports progress:

Found 377 WACZ files
✓ filename1: extracted 4 images
✓ filename2: extracted 0 images
...
Extracted 1112 total images to extracted_webcam

Results

From 377 WACZ files (40GB total):

  • 1,112 webcam images extracted
  • 278 timestamps with webcam data
  • Date range: August 5-27, 2025
  • Source: customs.gov.by/webcam/ monitoring

Notes

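  • Opens every .wacz file but only extracts responses whose URL matches the Belarus customs webcam image pattern (/webcam/OSH*.jpg); the host name itself is not checked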
  • Validates JPEG format with magic byte checking
  • Handles compressed WARC files within WACZ archives
  • Skips files without relevant webcam content
#!/usr/bin/env python3
import zipfile
import os
from pathlib import Path
from warcio.archiveiterator import ArchiveIterator
from urllib.parse import urlparse
def extract_images_from_warc(warc_stream, output_dir, wacz_name):
    """Extract webcam ``OSH*.jpg`` images from a WARC stream.

    Every ``response`` record whose target URL has a path containing
    ``/webcam/OSH`` and ending in ``.jpg`` is considered. Its payload is
    written to ``output_dir / f"{wacz_name}_{image_name}"`` when it starts
    with the JPEG start-of-image marker.

    Args:
        warc_stream: Binary stream handed to ``ArchiveIterator``.
        output_dir: Existing directory that receives the images.
        wacz_name: Prefix used for every output filename.

    Returns:
        A list with the path (as ``str``) of each image written, in the
        order the records were encountered. Records sharing an image name
        are written to the same file (the later one overwrites the earlier)
        and each write adds an entry to the list.
    """
    images = []
    for record in ArchiveIterator(warc_stream):
        if record.rec_type != 'response':
            continue

        url = record.rec_headers.get_header('WARC-Target-URI', '')
        # Cheap substring test first so only candidate URLs get parsed.
        if '/webcam/OSH' not in url:
            continue

        # Match on the URL *path*, not the raw URL: a cache-busting query
        # string ("OSH1.jpg?t=123") must not hide the ".jpg" suffix, and a
        # pattern that only occurs inside a query string must not match.
        try:
            url_path = urlparse(url).path
        except ValueError:
            # Malformed URL in the archive; nothing sensible to extract.
            continue
        if '/webcam/OSH' not in url_path or not url_path.endswith('.jpg'):
            continue

        content = record.content_stream().read()
        if not content.startswith(b'\xff\xd8'):  # JPEG magic bytes
            continue

        image_name = Path(url_path).name
        output_path = Path(output_dir) / f"{wacz_name}_{image_name}"
        output_path.write_bytes(content)
        images.append(str(output_path))
    return images
def process_wacz_file(wacz_path, output_dir):
    """Process a single WACZ file and extract webcam images.

    Opens *wacz_path* as a ZIP archive and passes every member under
    ``archive/`` whose name contains ``.warc`` to
    ``extract_images_from_warc``. A one-line status is printed: ``✓`` with
    the image count on success, ``✗`` with the error message on failure.

    Args:
        wacz_path: Path to the ``.wacz`` file.
        output_dir: Directory that receives the extracted images.

    Returns:
        A list of the paths of the images written. If an error interrupts
        processing, the images collected before the error are still
        returned, so the caller's totals match the files already on disk.
    """
    wacz_name = Path(wacz_path).stem
    images = []
    try:
        with zipfile.ZipFile(wacz_path, 'r') as zf:
            warc_files = [
                name for name in zf.namelist()
                if name.startswith('archive/') and '.warc' in name
            ]
            for warc_name in warc_files:
                with zf.open(warc_name) as warc_stream:
                    images.extend(
                        extract_images_from_warc(warc_stream, output_dir, wacz_name)
                    )
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable archive must not
        # stop the whole batch, so report it and carry on.
        print(f"✗ {wacz_name}: {e}")
    else:
        print(f"✓ {wacz_name}: extracted {len(images)} images")
    return images
def main():
    """Extract webcam images from every ``data/*.wacz`` archive.

    Creates ``extracted_webcam`` if needed, runs ``process_wacz_file`` on
    each archive found directly inside ``data``, and prints the number of
    archives found and the total number of images extracted.
    """
    source_dir, output_dir = Path("data"), Path("extracted_webcam")
    output_dir.mkdir(exist_ok=True)

    wacz_files = list(source_dir.glob("*.wacz"))
    print(f"Found {len(wacz_files)} WACZ files")

    # Flatten the per-archive results into a single list of image paths.
    all_images = [
        image
        for wacz_file in wacz_files
        for image in process_wacz_file(wacz_file, output_dir)
    ]
    print(f"\nExtracted {len(all_images)} total images to {output_dir}")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment