Created
August 29, 2018 11:15
-
-
Save Arkanayan/35ed90e3c051893729cc0fc6210eff26 to your computer and use it in GitHub Desktop.
Extract image from pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Miniconda-based image that installs the system packages below (build tools,
# GTK/GL libraries, Tesseract OCR, poppler-utils, enchant), applies the conda
# environment from environment.yml, and starts run.py.
FROM continuumio/miniconda3

# MAINTAINER is deprecated; a LABEL carries the same metadata.
LABEL maintainer="Arka Nayan <[email protected]>"

VOLUME ["/app"]
WORKDIR /app

# Run `apt-get update` and every `apt-get install` in one layer so an install
# never executes against a stale, cached package index; drop the index
# afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y \
        build-essential \
        libgtk2.0-dev \
        libgl1-mesa-glx \
        tesseract-ocr \
        libtesseract-dev \
        poppler-utils \
        enchant && \
    rm -rf /var/lib/apt/lists/*

# -y: image builds are non-interactive, so conda must not wait for a
# confirmation prompt.
RUN conda update -y -n base conda
#RUN conda install -y -c menpo opencv3

# Apply the project's dependencies to the base conda environment.
COPY environment.yml /tmp/
RUN conda env update --name base -f /tmp/environment.yml

EXPOSE 5000
CMD ["python", "run.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import os | |
import tempfile | |
import subprocess | |
# Dependency: poppler-utils (provides the `pdfimages` command used below).
# See the Dockerfile for how to install poppler-utils, or https://www.howtogeek.com/228531/
# Tested on Linux | |
# Distributed with absolutely no guarantee | |
def _pdfimages_sort_key(image_path):
    """
    Build a sort key for a file written by ``pdfimages -p``.

    With ``-p`` the output files are named ``<root>-<page>-<index>.<ext>``;
    the key orders them numerically by page and then by image index. Files
    whose name does not end in two numeric fields sort last, by name.

    :param image_path: pathlib.Path of an extracted image
    :return: tuple usable as a ``sorted`` key
    """
    try:
        _, page, index = image_path.stem.rsplit('-', 2)
        return (0, int(page), int(index), image_path.name)
    except ValueError:
        # Fewer than three fields, or non-numeric page/index.
        return (1, 0, 0, image_path.name)


def extract_images_from_pdf(pdf_path, destination_path):
    """
    Extract images from pdfs (using poppler-utils)

    The images are first written by ``pdfimages`` into a temporary directory
    and then copied into ``destination_path`` as ``<pdf name>.<n><ext>``
    (``file.1.png``, ``file.2.png``, ...), numbered from 1 in page order.

    :param pdf_path: str Path to the pdf file
    :param destination_path: str Destination directory where the extracted images are saved
    :return: list Names of the extracted images ordered by pages
    :raises RuntimeError: if ``pdfimages`` is not installed, exits with a
        non-zero status, or writes anything to stderr
    """
    # Local import: the module-level imports of this file do not include shutil.
    import shutil

    pdf_path = Path(pdf_path)
    destination_dir = Path(destination_path).absolute()
    fname = pdf_path.stem
    install_hint = ('Please make sure poppler-utils is installed: see: '
                    'https://www.howtogeek.com/228531/')
    extracted_images = []

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # -png: write PNG files; -p: include the page number in each file name.
        args = ['pdfimages', '-png', '-p', str(pdf_path), str(tmpdir / fname)]
        try:
            proc = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except FileNotFoundError as err:
            # The pdfimages executable itself could not be found.
            raise RuntimeError('PDF extraction error. ' + install_hint) from err

        # Any stderr output is treated as a failure, even with exit status 0.
        if proc.returncode != 0 or proc.stderr:
            message = 'PDF extraction error. ' + install_hint
            details = proc.stderr.decode(errors='replace').strip()
            if details:
                message += ' (pdfimages: ' + details + ')'
            raise RuntimeError(message)

        # sort images by the order of the pages they are extracted from
        images = sorted(tmpdir.glob('*.*'), key=_pdfimages_sort_key)
        for idx, image in enumerate(images, start=1):
            # output format: file.1.png, file.2.png
            new_filename = '{}.{}{}'.format(fname, idx, image.suffix)
            shutil.copyfile(str(image), str(destination_dir / new_filename))
            extracted_images.append(new_filename)

    return extracted_images
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment