Last active
February 13, 2021 00:10
-
-
Save portableant/a4f242878ca863d12d811ebd8c2dea4a to your computer and use it in GitHub Desktop.
Hacky script for splitting pdf to pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
## Split pdf files into pages
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example for Lucinda
## Save this file on your machine, make sure you have python installed.
## You may need to have installed libraries to run this file eg
## pip install wand
## pip install PyPDF2
## NOTE: PdfFileReader/PdfFileWriter are the legacy PyPDF2 class names; they were
## removed in PyPDF2 3.0, so install an older release (pip install "PyPDF2<3")
## if the import below fails.
## Save your pdf file in the same directory as the python script
## This script takes 4 arguments as defined below
## An example to run it is:
## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
## What does the above do - 1. Ask python to run the script 2. -p . means the path is current
## directory 3. -f means the file name to split 4. -d means the directory to create 5. -n is the base
## file name for the new files to be named
import argparse
import os

from PyPDF2 import PdfFileWriter, PdfFileReader
from wand.image import Image  # not used by this script; import kept from the original


def _parse_args():
    """Build the command line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
    # Add arguments
    parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
    parser.add_argument('-f', '--file', help='The file to process', required=True)
    parser.add_argument('-n', '--name', help='The new file name', required=True)
    parser.add_argument('-d', '--destination', help='The processed folder', required=True)
    # Parse the arguments
    return parser.parse_args()


def _split_pdf(file_name, page_name):
    """Write every page of ``file_name`` to its own single-page pdf.

    ``page_name`` is a %-format template with one ``%s`` slot that receives
    the 1-based page number.
    """
    # The reader pulls page data from the stream lazily, so the source file
    # has to stay open until the last page has been written; the ``with``
    # block then guarantees the handle is closed.
    with open(file_name, "rb") as source:
        inputpdf = PdfFileReader(source)
        # Cycle through pages and create new pdfs
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            with open(page_name % (i + 1), "wb") as output_stream:
                output.write(output_stream)


def main():
    """Split the pdf named on the command line into one pdf per page."""
    args = _parse_args()

    # Construct variables and print them out
    path = args.path
    print(path)
    destination = os.path.join(path, args.destination)
    print(destination)
    page_name = os.path.join(destination, args.name) + '_%s.pdf'
    print(page_name)
    file_name = os.path.join(path, args.file)
    print(file_name)

    # Make the directory if it does not exist
    if not os.path.exists(destination):
        os.makedirs(destination)

    _split_pdf(file_name, page_name)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
## Split pdf files into pages and ocr text (this is a bit hacky, but works as a demo)
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example
## python3 splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
## mac osx brew install poppler and echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc
import argparse
import os
import sys

# pip install Pillow
from PIL import Image
# pip3 install pytesseract
import pytesseract
# pip3 install PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
# pip3 install pdf2image
from pdf2image import convert_from_path
# pip install wand
#from wand.image import Image

# Prefer the Homebrew location the script was written against, but only when
# it actually exists; otherwise leave pytesseract's own default in place so a
# tesseract binary found on PATH is used instead of failing outright.
_TESSERACT_CMD = r'/usr/local/bin/tesseract'
if os.path.exists(_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD

# Intermediate JPEG renderings are written here, relative to the current
# working directory (not to --path).
_IMAGE_FOLDER = 'images'


def _parse_args():
    """Build the command line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
    parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
    parser.add_argument('-f', '--file', help='The file to process', required=True)
    parser.add_argument('-n', '--name', help='The new file name', required=True)
    parser.add_argument('-d', '--destination', help='The processed folder', required=True)
    parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)
    # Parse arguments
    return parser.parse_args()


def _ensure_folder(folder):
    """Create ``folder`` (and any missing parents) if it does not exist."""
    if not os.path.exists(folder):
        os.makedirs(folder)


def _split_pdf(file_name, page_name):
    """Write every page of ``file_name`` to its own single-page pdf.

    ``page_name`` is a %-format template with one ``%s`` slot that receives
    the 1-based page number.
    """
    # Keep the source open while pages are copied (the reader pulls page data
    # from the stream lazily) and close it once the last page is written.
    with open(file_name, "rb") as source:
        inputpdf = PdfFileReader(source)
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            with open(page_name % (i + 1), "wb") as output_stream:
                output.write(output_stream)


def _ocr_folder(destination, ocr_folder):
    """OCR every pdf found in ``destination`` into a .txt file in ``ocr_folder``.

    Each pdf is rendered to ``images/<name>.jpg`` and that JPEG is passed to
    tesseract. Every pdf present in ``destination`` is processed, including
    any left over from earlier runs.
    """
    for file in sorted(os.listdir(destination)):
        if not file.endswith(".pdf"):
            continue
        filepath = os.path.join(destination, file)
        img_name = os.path.splitext(file)[0]
        jpg_name = os.path.join(_IMAGE_FOLDER, img_name + '.jpg')

        pages = convert_from_path(filepath)
        if not pages:
            # Nothing was rendered, so there is no image to OCR.
            continue
        # The split step writes single-page pdfs; for a multi-page pdf each
        # page overwrites the same JPEG and only the last page is OCR'd.
        for page in pages:
            page.save(jpg_name, 'JPEG')

        # Close the image handle as soon as tesseract is done with it.
        with Image.open(jpg_name) as image:
            text = pytesseract.image_to_string(image)

        # Write into the folder requested with --ocr (under --path) rather
        # than a hard-coded './ocr/' that may not exist.
        ocr_name = os.path.join(ocr_folder, img_name + '.txt')
        with open(ocr_name, mode='w') as f:
            f.write(text)


def main():
    """Split the pdf named on the command line into pages and OCR each page."""
    args = _parse_args()

    path = args.path
    destination = os.path.join(path, args.destination)
    ocrfolder = os.path.join(path, args.ocr)
    page_name = os.path.join(destination, args.name) + '%s.pdf'
    file_name = os.path.join(path, args.file)

    _ensure_folder(destination)
    _ensure_folder(ocrfolder)
    _ensure_folder(_IMAGE_FOLDER)

    _split_pdf(file_name, page_name)
    _ocr_folder(destination, ocrfolder)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment