Last active
February 10, 2025 18:06
-
-
Save nealcaren/c3ddac6d3c483749fdbbea1c3497a763 to your computer and use it in GitHub Desktop.
Extract Images from PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "pymupdf",
#     "pillow",
# ]
# ///
import fitz # PyMuPDF | |
import os | |
import hashlib | |
import io | |
from PIL import Image | |
import sys | |
# Lift Pillow's pixel-count limit so that very large embedded images do not
# raise DecompressionBombError when they are opened.
# NOTE(review): this switches off a safety check for the whole process; only
# run the script on PDFs from a trusted source.
Image.MAX_IMAGE_PIXELS = None
def get_image_hash(image_bytes):
    """Return a hex MD5 digest of the image data, used to detect duplicates.

    Args:
        image_bytes: Encoded image data as a bytes-like object.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest of ``image_bytes``.
    """
    # The digest is only a de-duplication key, not a security measure.
    # Declaring that lets MD5 be used on builds where it is otherwise
    # blocked for security purposes (e.g. FIPS-restricted OpenSSL).
    return hashlib.md5(image_bytes, usedforsecurity=False).hexdigest()
def save_image_preserving_quality(image_bytes, image_ext, image_path):
    """Write an extracted image to disk without decoding or re-encoding it.

    The encoded bytes are written verbatim, so the saved file is
    byte-identical to the data passed in and no quality can be lost.

    Args:
        image_bytes: Encoded image data (bytes-like), already in the format
            named by ``image_ext``.
        image_ext: File extension reported for the image, e.g. ``"jpg"`` or
            ``"png"``. Kept for interface compatibility; the bytes are
            written unchanged whatever the format is.
        image_path: Destination file path. An existing file is overwritten.

    Raises:
        OSError: If the destination file cannot be opened or written.
    """
    # Opening the data with Pillow and saving it again re-runs the encoder:
    # for JPEG that is a lossy re-compression even at quality=100, and any
    # format Pillow cannot decode would make the save fail outright.
    # Copying the bytes as they are avoids both problems.
    with open(image_path, "wb") as out_file:
        out_file.write(image_bytes)
def extract_high_quality_images_from_pdf(pdf_path, output_folder):
    """Extract the embedded images of a PDF into a folder, skipping duplicates.

    Every image referenced by a page is extracted by its xref and saved as
    ``page_<page number>_img_<index on page>.<ext>`` inside
    ``output_folder``. Page numbers start at 1; image indexes start at 0.
    An image whose content hash has already been seen is not saved again.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the images. It is created
            if it does not exist yet.

    Returns:
        The number of image files that were saved.
    """
    seen_xrefs = set()
    seen_hashes = set()
    image_count = 0

    # The context manager closes the document even when extraction or
    # saving fails part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        os.makedirs(output_folder, exist_ok=True)

        for page_number, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]

                # The same xref yields the same bytes, so an object that was
                # already handled can be skipped without extracting it again.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Different xrefs may still hold identical data; the content
                # hash catches those duplicates.
                image_hash = get_image_hash(image_bytes)
                if image_hash in seen_hashes:
                    continue
                seen_hashes.add(image_hash)

                image_filename = f"page_{page_number}_img_{img_index}.{image_ext}"
                image_path = os.path.join(output_folder, image_filename)
                save_image_preserving_quality(image_bytes, image_ext, image_path)
                image_count += 1
                print(f"Saved: {image_path}")

    print(f"\nExtraction complete! {image_count} images saved in '{output_folder}'.")
    return image_count
if __name__ == "__main__":
    # Command-line entry point: the PDF path is required, the output
    # folder is optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python extract_images.py <PDF_FILE> [OUTPUT_FOLDER]")
        sys.exit(1)

    source_pdf = cli_args[0]
    if len(cli_args) >= 2:
        target_dir = cli_args[1]
    else:
        # No folder given: use the PDF's base name, minus its extension,
        # followed by "_images".
        pdf_stem, _ = os.path.splitext(os.path.basename(source_pdf))
        target_dir = pdf_stem + "_images"

    extract_high_quality_images_from_pdf(source_pdf, target_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment