Skip to content

Instantly share code, notes, and snippets.

@flacle
Last active January 14, 2022 00:38
Show Gist options
  • Save flacle/09c7ba2386f60a1f3b60ac6b6482cdb4 to your computer and use it in GitHub Desktop.
Save flacle/09c7ba2386f60a1f3b60ac6b6482cdb4 to your computer and use it in GitHub Desktop.
Extract images from Powerpoint presentation files.
# Author: Francis Laclé
# 13/01/2022
# Extract images from Powerpoint presentation files.
# We assume PPT(X)'s are in subdirectories and we scan one level deep.
# In case more depth is required, use a "**" pattern together with recursive=True
# in the glob call in main_func (the flag alone has no effect without "**").
# Objective:
# From the root directory, scan for PPT(X) files in sub directories (1-level)
# Then, for each PPT(X) open it, extract the images and save them on the disk
import os
import warnings
import zipfile
from glob import glob

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
def iter_picture_shapes(prs):
    """Yield every picture shape in the presentation *prs*.

    Slides are visited in order and, within a slide, shapes are visited in
    document order. Group shapes are descended into recursively so that
    pictures nested inside a group (at any depth) are yielded as well.

    Args:
        prs: A python-pptx ``Presentation`` object.

    Yields:
        Each shape whose ``shape_type`` is ``MSO_SHAPE_TYPE.PICTURE``.
    """

    def _walk(shapes):
        # Depth-first walk over a shape collection, expanding groups in place.
        for shape in shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                # A group exposes its members through its own ``shapes``.
                yield from _walk(shape.shapes)
            elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                yield shape

    for slide in prs.slides:
        yield from _walk(slide.shapes)
# Specify a "root" directory as the starting point
def main_func(root_dir):
    """Extract the pictures of all presentations one level below *root_dir*.

    Every immediate subdirectory of ``root_dir`` is searched for files
    matching ``*.ppt*``. Each picture found is written into that same
    subdirectory as ``<n>_<image filename>``, where ``n`` is a counter that
    runs across all presentations of the subdirectory.

    Files that match the pattern but are not zip packages (legacy binary
    ``.ppt`` files, Office ``~$`` lock files) are skipped with a warning
    instead of aborting the whole scan.

    Args:
        root_dir: Path of the directory whose subdirectories are scanned.
    """
    # from https://stackoverflow.com/a/36426997/861597
    subdirs = glob(root_dir + "/*/", recursive=False)
    for subdir in subdirs:
        # One counter per directory rather than per file: images loaded from
        # a package usually share a generic name such as "image.png", so
        # restarting at 0 for each presentation would overwrite the images
        # extracted from the previous presentation in the same directory.
        counter = 0
        # Sorted so the numbering is reproducible between runs.
        for filename in sorted(glob(subdir + "*.ppt*")):
            # python-pptx opens zip-based packages only; anything else that
            # matches the pattern would raise and stop the scan.
            if not zipfile.is_zipfile(filename):
                warnings.warn(
                    "Skipping {!r}: not a zip-based PowerPoint package".format(
                        filename
                    )
                )
                continue
            # adapted from https://stackoverflow.com/a/52504408/861597
            for picture in iter_picture_shapes(Presentation(filename)):
                image = picture.image
                image_filename = os.path.join(
                    subdir, "{}_{}".format(counter, image.filename)
                )
                # ---write the image "file" contents---
                with open(image_filename, "wb") as f:
                    f.write(image.blob)
                counter += 1
if __name__ == "__main__":
    # Guarded so that importing this module does not start a scan.
    # Placeholder path: replace with the root directory to scan.
    main_func('./some/dir/path/string')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment