Skip to content

Instantly share code, notes, and snippets.

@dublado
Created May 29, 2024 16:26
Show Gist options
  • Save dublado/db25fcc72c45b56f24b2091803d750bf to your computer and use it in GitHub Desktop.
Save dublado/db25fcc72c45b56f24b2091803d750bf to your computer and use it in GitHub Desktop.
extract images from pptx
import os
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from PIL import Image
# Input presentation and the working directories used by this script.
pptx_path = "pptx.pptx"
extracted_path = "pptx_extracted"  # where the archive contents are unpacked
output_dir = "output"  # where the converted images are written

# The .pptx is opened as a ZIP archive; unpack it to explore its content.
# The destination is taken from extracted_path so the two can never drift.
with ZipFile(pptx_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)
# Function to extract images from ppt/media folder
def extract_images_from_pptx(extracted_path, output_dir):
    """Re-save the PNG/JPEG files found under ``ppt/media`` into ``output_dir``.

    Args:
        extracted_path: Directory holding the unpacked presentation; images
            are looked up in its ``ppt/media`` sub-directory.
        output_dir: Directory the images are written to, keeping their
            original file names. It is created if it does not exist.

    Returns:
        None. If ``ppt/media`` is missing, nothing is written.
    """
    # Create the destination up front so saving (and any later listing of
    # output_dir by the caller) does not fail on a missing directory.
    os.makedirs(output_dir, exist_ok=True)

    media_path = os.path.join(extracted_path, "ppt", "media")
    if not os.path.isdir(media_path):
        return

    for media_file in sorted(os.listdir(media_path)):
        # Compare the extension case-insensitively so "PHOTO.JPG" is kept.
        extension = os.path.splitext(media_file)[1].lower()
        if extension not in ('.png', '.jpeg', '.jpg'):
            continue
        # PNG keeps an alpha channel (RGBA). JPEG has no alpha support and
        # Pillow refuses to write RGBA as JPEG, so those are saved as RGB.
        mode = "RGBA" if extension == '.png' else "RGB"
        # The context manager closes the underlying file handle promptly.
        with Image.open(os.path.join(media_path, media_file)) as img:
            img.convert(mode).save(os.path.join(output_dir, media_file))
# Run the extraction on the unpacked presentation.
extract_images_from_pptx(extracted_path=extracted_path, output_dir=output_dir)

# Collect the names of the files now present in the output directory.
extracted_images = os.listdir(output_dir)
# (Notebook-style echo of the result, left disabled: extracted_images)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment