Skip to content

Instantly share code, notes, and snippets.

@nealcaren
Last active February 10, 2025 18:06
Show Gist options
  • Save nealcaren/c3ddac6d3c483749fdbbea1c3497a763 to your computer and use it in GitHub Desktop.
Save nealcaren/c3ddac6d3c483749fdbbea1c3497a763 to your computer and use it in GitHub Desktop.
Extract Images from PDF
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "pymupdf",
# "pillow",
# ]
# ///
import fitz # PyMuPDF
import os
import hashlib
import io
from PIL import Image
import sys
# Turn off Pillow's pixel-count limit so very large embedded images do not
# abort processing with DecompressionBombError.
# NOTE(review): this also removes the safeguard against maliciously huge
# images -- confirm the script is only ever run on trusted PDFs.
Image.MAX_IMAGE_PIXELS = None # Disable DecompressionBombError
def get_image_hash(image_bytes):
    """Return the hex MD5 digest of ``image_bytes`` for duplicate detection.

    The digest is only used as a fingerprint to spot byte-identical images,
    never for security, so MD5 is requested with ``usedforsecurity=False``.
    That keeps the call working on FIPS-restricted Python builds, where MD5
    is otherwise blocked, without changing the resulting digest.

    Args:
        image_bytes: Raw encoded image data (any bytes-like object).

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    return hashlib.md5(image_bytes, usedforsecurity=False).hexdigest()
def save_image_preserving_quality(image_bytes, image_ext, image_path):
    """Write an extracted image to disk exactly as it was embedded.

    The bytes handed in are already a complete image file in the format
    named by ``image_ext``, so they are written verbatim. Decoding and
    re-encoding them (even JPEG at ``quality=100``) is a lossy round trip,
    can drop colour-space information and metadata, and fails for formats
    the imaging library cannot read or write; a byte-for-byte copy avoids
    all of that and is the only way to truly preserve the original quality.

    Args:
        image_bytes: Encoded image data to store (bytes-like object).
        image_ext: File extension / format name of the data (e.g. ``"png"``).
            Kept for interface compatibility; the data is written unchanged
            regardless of format.
        image_path: Destination file path. An existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened or written.
    """
    # Binary mode + context manager: no newline translation, and the file
    # handle is closed even if the write fails part-way.
    with open(image_path, "wb") as out_file:
        out_file.write(image_bytes)
def extract_high_quality_images_from_pdf(pdf_path, output_folder):
    """Extract every unique embedded image of a PDF into ``output_folder``.

    Each page's image list is walked, the embedded image data is pulled out
    by xref, and images whose bytes were already seen (same content hash)
    are skipped. Files are named ``page_<page>_img_<index>.<ext>`` after the
    first page/index at which the image was encountered. Progress and a
    final summary are printed to stdout.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the images; created if it
            does not exist.

    Returns:
        None.
    """
    # The context manager guarantees the document handle is released even
    # when extraction or saving raises part-way through.
    with fitz.open(pdf_path) as doc:
        os.makedirs(output_folder, exist_ok=True)
        seen_xrefs = set()
        seen_hashes = set()
        image_count = 0

        for page_number, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                # An xref referenced from several pages always yields the
                # same bytes, so skip it before paying for extraction and
                # hashing again.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Different xrefs can still carry identical image data;
                # the content hash catches those duplicates.
                image_hash = get_image_hash(image_bytes)
                if image_hash in seen_hashes:
                    continue
                seen_hashes.add(image_hash)

                image_filename = f"page_{page_number}_img_{img_index}.{image_ext}"
                image_path = os.path.join(output_folder, image_filename)
                save_image_preserving_quality(image_bytes, image_ext, image_path)
                image_count += 1
                print(f"Saved: {image_path}")

    print(f"\nExtraction complete! {image_count} images saved in '{output_folder}'.")
if __name__ == "__main__":
    # Command line: <PDF_FILE> is required, [OUTPUT_FOLDER] is optional.
    if not sys.argv[1:]:
        print("Usage: python extract_images.py <PDF_FILE> [OUTPUT_FOLDER]")
        sys.exit(1)

    pdf_file = sys.argv[1]

    # Fallback destination: the PDF's file name without its extension,
    # suffixed with "_images" (relative to the current directory).
    pdf_stem, _ = os.path.splitext(os.path.basename(pdf_file))
    default_output = f"{pdf_stem}_images"

    if len(sys.argv) > 2:
        output_dir = sys.argv[2]
    else:
        output_dir = default_output

    extract_high_quality_images_from_pdf(pdf_file, output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment