Last active
January 14, 2022 00:38
-
-
Save flacle/09c7ba2386f60a1f3b60ac6b6482cdb4 to your computer and use it in GitHub Desktop.
Extract images from Powerpoint presentation files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Francis Laclé | |
# 13/01/2022 | |
# Extract images from Powerpoint presentation files. | |
# We assume PPT(X)'s are in subdirectories and we scan one level deep. | |
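# In case more depth is required, put a "**" component in the directory glob
# pattern inside main_func AND set its recursive flag to True; the flag alone
# has no effect on a pattern that does not contain "**".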
# Objective: | |
# From the root directory, scan for PPT(X) files in sub directories (1-level) | |
# Then, for each PPT(X) open it, extract the images and save them on the disk | |
import logging
import os
from glob import glob

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
def iter_picture_shapes(prs):
    """Yield the picture shapes of a presentation, slide by slide.

    Walks ``prs.slides`` in order and, for each slide, tests every shape
    returned by ``slide.shapes``. Only shapes whose ``shape_type`` equals
    ``MSO_SHAPE_TYPE.PICTURE`` are yielded; everything else is passed over.
    """
    for slide in prs.slides:
        # Filter lazily so nothing is materialised beyond the current shape.
        yield from (
            candidate
            for candidate in slide.shapes
            if candidate.shape_type == MSO_SHAPE_TYPE.PICTURE
        )
# Specify a "root" directory as the starting point | |
def main_func(root_dir):
    """Extract the pictures of every presentation one level below *root_dir*.

    Each immediate subdirectory of ``root_dir`` is searched for files matching
    ``*.ppt*``. Every picture found in a presentation is written next to it as
    ``<index>_<image filename>``.

    Args:
        root_dir: Directory whose immediate subdirectories are scanned.
            Presentations placed directly in ``root_dir`` are not scanned.

    Returns:
        None. The extracted images are written to disk as a side effect.

    Notes:
        * ``<index>`` keeps counting across all presentations of a directory.
          Restarting it for every presentation lets two decks in the same
          directory produce the same output name and overwrite each other.
        * Files that python-pptx cannot open (legacy binary ``.ppt`` files and
          the ``~$...`` lock files Office creates while a deck is open) are
          skipped with a warning instead of aborting the whole run.
        * Directories and files are processed in sorted order so the numbering
          is reproducible between runs.
    """
    log = logging.getLogger(__name__)
    # Extensions of the Open XML packages that Presentation() can load.
    openable_extensions = (".pptx", ".pptm")

    # from https://stackoverflow.com/a/36426997/861597
    for sub_dir in sorted(glob(root_dir + "/*/", recursive=False)):
        index = 0  # one running index per directory, see Notes above
        for filename in sorted(glob(sub_dir + "*.ppt*")):
            base_name = os.path.basename(filename)
            extension = os.path.splitext(base_name)[1].lower()
            if base_name.startswith("~$") or extension not in openable_extensions:
                log.warning("Skipping %s: not a presentation python-pptx can open", filename)
                continue

            # adapted from https://stackoverflow.com/a/52504408/861597
            for picture in iter_picture_shapes(Presentation(filename)):
                image = picture.image
                # sub_dir already ends with a path separator (glob pattern "/*/").
                image_filename = sub_dir + str(index) + '_' + image.filename
                with open(image_filename, 'wb') as f:
                    # ---write image "file" contents---
                    f.write(image.blob)
                index += 1
# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse iter_picture_shapes) does not touch the disk.
# Replace the placeholder below with the root directory to scan.
if __name__ == "__main__":
    main_func('./some/dir/path/string')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment