Skip to content

Instantly share code, notes, and snippets.

@tigrouind
Last active August 4, 2025 08:22
Show Gist options
  • Save tigrouind/123f2bc6512e0f560cc857f4f46c4b30 to your computer and use it in GitHub Desktop.
Save tigrouind/123f2bc6512e0f560cc857f4f46c4b30 to your computer and use it in GitHub Desktop.
Extract JPEG images from a PDF.
import fitz #pip install PyMuPDF
import sys
import os
import zipfile
def extract_jpg_images_from_pdf(pdf_path):
    """Collect the JPEG images embedded in a PDF.

    Every page of the document is scanned. Each image is identified by its
    xref (the first item of the tuples returned by ``page.get_images``), and
    an xref that appears on several pages is extracted only once. Only images
    whose reported extension is "jpg" or "jpeg" (case-insensitive) are kept.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of ``(image_bytes, extension)`` tuples in order of first
        appearance; empty when the PDF contains no JPEG images.
    """
    images = []
    seen_xrefs = set()
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]
                # The same image object can be referenced from many pages
                # (e.g. a logo); extract it a single time.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)
                base_image = doc.extract_image(xref)
                img_ext = base_image["ext"]
                if img_ext.lower() in ("jpg", "jpeg"):
                    images.append((base_image["image"], img_ext))
    return images
def save_images_and_zip(images, pdf_path):
    """Write extracted images into a directory named after the PDF.

    The directory name is the PDF's file name without its extension and is
    created relative to the current working directory (it may already exist).
    Files are named ``<base>_<n>.<ext>`` with ``n`` starting at 1.

    Note: despite the function name, no zip archive is created; the images
    are only written to the directory.

    Args:
        images: Iterable of ``(image_bytes, extension)`` tuples.
        pdf_path: Path (or bare file name) of the source PDF; only its base
            name is used.

    Returns:
        The list of paths of the files written, in the same order as
        ``images``.
    """
    base = os.path.splitext(os.path.basename(pdf_path))[0]
    out_dir = base
    os.makedirs(out_dir, exist_ok=True)

    img_paths = []
    for index, (img_bytes, img_ext) in enumerate(images, start=1):
        img_path = os.path.join(out_dir, f"{base}_{index}.{img_ext}")
        with open(img_path, "wb") as img_file:
            img_file.write(img_bytes)
        img_paths.append(img_path)
    return img_paths
def main():
    """Extract the JPEG images of every PDF found in a folder.

    Expects exactly one command-line argument: the folder to scan. Each file
    in that folder whose name ends in ".pdf" (case-insensitive) is processed
    in sorted order; its JPEG images are written by ``save_images_and_zip``.
    A PDF without JPEG images is reported and skipped, and processing
    continues with the next file.

    Exits with status 1 when the argument count is wrong or the argument is
    not a directory.
    """
    if len(sys.argv) != 2:
        print("Usage: python extract_jpgs_from_pdf.py <folder>")
        sys.exit(1)

    folder = sys.argv[1]
    if not os.path.isdir(folder):
        print(f"Not a folder: {folder}")
        sys.exit(1)

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(".pdf"):
            continue
        print(filename)
        # os.listdir returns bare names; join with the folder so the PDF is
        # found regardless of the current working directory.
        pdf_path = os.path.join(folder, filename)
        images = extract_jpg_images_from_pdf(pdf_path)
        if not images:
            print("No JPG images found in PDF.")
            # Skip only this file; the remaining PDFs must still be handled.
            continue
        save_images_and_zip(images, pdf_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment