portableant · February 13, 2021 00:10
diff --git a/splitPdf.py b/splitPdf.py
 #!/usr/bin/python
 ## Split pdf files into pages
 ## Daniel Pett 11/2/2021
 __author__ = 'portableant'
 ## Tested on Python 2.7.16


 ## Usage example for Lucinda
 ## Save this file on your machine, make sure you have python installed. 
 ## You may need to have installed libraries to run this file eg
 ## pip install wand
 ## pip install PyPDF2
 ## Save your pdf file in the same directory as the python script
 ## This script takes 4 arguments as defined below

 ## An example to run it is:
 ## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
 ## What does the above do - 1. Ask python to run the script 2. -p . means the path is current 
 ## directory 3. -f means the file name to split 4. -d means the directory to create 5. -n is the base
 ## file name for the new files to be named

 import argparse
 import os

 from PyPDF2 import PdfFileWriter, PdfFileReader

 from wand.image import Image

 # Command-line interface: all four options are mandatory.
 parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')

 # Add arguments
 parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
 parser.add_argument('-f', '--file', help='The file to process', required=True)
 parser.add_argument('-n', '--name', help='The new file name', required=True)
 parser.add_argument('-d', '--destination', help='The processed folder', required=True)

 # Parse the arguments
 args = parser.parse_args()

 # Construct variables and print them out so the user can check the
 # resolved locations. Everything is resolved relative to --path.
 path = args.path
 print(path)
 destination = os.path.join(path, args.destination)
 print(destination)
 # Template for the per-page file names; %s receives the 1-based page number.
 pageName = os.path.join(destination, args.name) + '_%s.pdf'
 print(pageName)
 fileName = os.path.join(path, args.file)
 print(fileName)

 # Make the directory if it does not exist
 if not os.path.exists(destination):
     os.makedirs(destination)

 # Open the source pdf in a with-block so the handle is closed when the
 # split is finished (previously it was left open until interpreter exit).
 # The handle stays open for the whole loop because the pages are written
 # out while the reader is still attached to it.
 with open(fileName, "rb") as sourceStream:
     inputpdf = PdfFileReader(sourceStream)

     # Cycle through pages and create one single-page pdf per source page
     for i in range(inputpdf.numPages):
         output = PdfFileWriter()
         output.addPage(inputpdf.getPage(i))
         # Pages are numbered from 1 in the output file names.
         with open(pageName % (i + 1), "wb") as outputStream:
             output.write(outputStream)
diff --git a/splitPDFandOCR.py b/splitPDFandOCR.py
 #!/usr/bin/python
 ## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
 ## Daniel Pett 11/2/2021
 __author__ = 'portableant'
 ## Tested on Python 2.7.16
 ## Usage example
 ## python3 splitPDFandOCR.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
 ## On macOS: brew install poppler, then echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

 import argparse
 import os
 import sys
 # pip install Pillow
 from PIL import Image

 # pip3 install pytesseract
 import pytesseract
 pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


 # pip3 install PyPDF2
 from PyPDF2 import PdfFileWriter, PdfFileReader
 # pip3 install pdf2image
 from pdf2image import convert_from_path
 # pip install wand
 #from wand.image import Image

 # Command-line interface: all five options are mandatory.
 parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
 parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
 parser.add_argument('-f', '--file', help='The file to process', required=True)
 parser.add_argument('-n', '--name', help='The new file name', required=True)
 parser.add_argument('-d', '--destination', help='The processed folder', required=True)
 parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)

 # Parse arguments
 args = parser.parse_args()

 # The split-pdf folder, the ocr folder and the input file are all resolved
 # relative to --path.
 path = args.path
 destination = os.path.join(path, args.destination)
 ocrfolder = os.path.join(path, args.ocr)
 # Template for the per-page pdf names; %s receives the 1-based page number.
 pageName = os.path.join(destination, args.name) + '%s.pdf'
 fileName = os.path.join(path, args.file)

 # Create the output folders if they are missing. The page images always go
 # to an 'images' folder in the current working directory.
 for folder in (destination, ocrfolder, 'images'):
     if not os.path.exists(folder):
         os.makedirs(folder)

 # Step 1: split the source pdf into one single-page pdf per page.
 # The source is opened in a with-block so the handle is closed afterwards
 # (previously it was left open until interpreter exit).
 with open(fileName, "rb") as sourceStream:
     inputpdf = PdfFileReader(sourceStream)
     for i in range(inputpdf.numPages):
         output = PdfFileWriter()
         output.addPage(inputpdf.getPage(i))
         with open(pageName % (i + 1), "wb") as outputStream:
             output.write(outputStream)

 # Step 2: render every pdf found in the destination folder to a JPEG and
 # OCR it into a text file of the same base name.
 for pdfFile in os.listdir(destination):
     if not pdfFile.endswith(".pdf"):
         continue
     filepath = os.path.join(destination, pdfFile)
     img = convert_from_path(filepath)
     imgName = os.path.splitext(pdfFile)[0]
     jpgName = os.path.join('./images/', imgName + '.jpg')
     # Write into the folder that was created from --ocr. This used to be a
     # hard-coded './ocr/', which only existed when run with "-p . -o ocr".
     ocrName = os.path.join(ocrfolder, imgName + '.txt')
     # NOTE: every page of a pdf is saved under the same jpg/txt name, so for
     # a multi-page pdf only the last page is kept. The files produced by
     # step 1 contain a single page each.
     for page in img:
         page.save(jpgName, 'JPEG')
         text = pytesseract.image_to_string(Image.open(jpgName))
         with open(ocrName, mode='w') as f:
             f.write(text)
	#!/usr/bin/python
	## Split pdf files into pages
	## Daniel Pett 11/2/2021
	__author__ = 'portableant'
	## Tested on Python 2.7.16


	## Usage example for Lucinda
	## Save this file on your machine, make sure you have python installed.
	## You may need to have installed libraries to run this file eg
	## pip install wand
	## pip install PyPDF2
	## Save your pdf file in the same directory as the python script
	## This script takes 4 arguments as defined below

	## An example to run it is:
	## python splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed
	## What does the above do - 1. Ask python to run the script 2. -p . means the path is current
	## directory 3. -f means the file name to split 4. -d means the directory to create 5. -n is the base
	## file name for the new files to be named

	import argparse
	import os

	from PyPDF2 import PdfFileWriter, PdfFileReader

	from wand.image import Image

	# Command-line interface: all four options are mandatory.
	parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')

	# Add arguments
	parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
	parser.add_argument('-f', '--file', help='The file to process', required=True)
	parser.add_argument('-n', '--name', help='The new file name', required=True)
	parser.add_argument('-d', '--destination', help='The processed folder', required=True)

	# Parse the arguments
	args = parser.parse_args()

	# Construct variables and print them out so the user can check the
	# resolved locations. Everything is resolved relative to --path.
	path = args.path
	print(path)
	destination = os.path.join(path, args.destination)
	print(destination)
	# Template for the per-page file names; %s receives the 1-based page number.
	pageName = os.path.join(destination, args.name) + '_%s.pdf'
	print(pageName)
	fileName = os.path.join(path, args.file)
	print(fileName)

	# Make the directory if it does not exist
	if not os.path.exists(destination):
	    os.makedirs(destination)

	# Open the source pdf in a with-block so the handle is closed when the
	# split is finished. The handle stays open for the whole loop because the
	# pages are written out while the reader is still attached to it.
	with open(fileName, "rb") as sourceStream:
	    inputpdf = PdfFileReader(sourceStream)

	    # Cycle through pages and create one single-page pdf per source page
	    for i in range(inputpdf.numPages):
	        output = PdfFileWriter()
	        output.addPage(inputpdf.getPage(i))
	        # Pages are numbered from 1 in the output file names.
	        with open(pageName % (i + 1), "wb") as outputStream:
	            output.write(outputStream)
	#!/usr/bin/python
	## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
	## Daniel Pett 11/2/2021
	__author__ = 'portableant'
	## Tested on Python 2.7.16
	## Usage example
	## python3 splitPDFandOCR.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
	## On macOS: brew install poppler, then echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

	import argparse
	import os
	import sys
	# pip install Pillow
	from PIL import Image

	# pip3 install pytesseract
	import pytesseract
	pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'


	# pip3 install PyPDF2
	from PyPDF2 import PdfFileWriter, PdfFileReader
	# pip3 install pdf2image
	from pdf2image import convert_from_path
	# pip install wand
	#from wand.image import Image

	# Command-line interface: all five options are mandatory.
	parser = argparse.ArgumentParser(description='A script for splitting pdf files into pages')
	parser.add_argument('-p', '--path', help='The path to the folder to process', required=True)
	parser.add_argument('-f', '--file', help='The file to process', required=True)
	parser.add_argument('-n', '--name', help='The new file name', required=True)
	parser.add_argument('-d', '--destination', help='The processed folder', required=True)
	parser.add_argument('-o', '--ocr', help='The ocr folder', required=True)

	# Parse arguments
	args = parser.parse_args()

	# The split-pdf folder, the ocr folder and the input file are all resolved
	# relative to --path.
	path = args.path
	destination = os.path.join(path, args.destination)
	ocrfolder = os.path.join(path, args.ocr)
	# Template for the per-page pdf names; %s receives the 1-based page number.
	pageName = os.path.join(destination, args.name) + '%s.pdf'
	fileName = os.path.join(path, args.file)

	# Create the output folders if they are missing. The page images always go
	# to an 'images' folder in the current working directory.
	for folder in (destination, ocrfolder, 'images'):
	    if not os.path.exists(folder):
	        os.makedirs(folder)

	# Step 1: split the source pdf into one single-page pdf per page.
	# The source is opened in a with-block so the handle is closed afterwards.
	with open(fileName, "rb") as sourceStream:
	    inputpdf = PdfFileReader(sourceStream)
	    for i in range(inputpdf.numPages):
	        output = PdfFileWriter()
	        output.addPage(inputpdf.getPage(i))
	        with open(pageName % (i + 1), "wb") as outputStream:
	            output.write(outputStream)

	# Step 2: render every pdf found in the destination folder to a JPEG and
	# OCR it into a text file of the same base name.
	for pdfFile in os.listdir(destination):
	    if not pdfFile.endswith(".pdf"):
	        continue
	    filepath = os.path.join(destination, pdfFile)
	    img = convert_from_path(filepath)
	    imgName = os.path.splitext(pdfFile)[0]
	    jpgName = os.path.join('./images/', imgName + '.jpg')
	    # Write into the folder that was created from --ocr rather than a
	    # hard-coded './ocr/', which only exists when run with "-p . -o ocr".
	    ocrName = os.path.join(ocrfolder, imgName + '.txt')
	    # NOTE: every page of a pdf is saved under the same jpg/txt name, so for
	    # a multi-page pdf only the last page is kept. The files produced by
	    # step 1 contain a single page each.
	    for page in img:
	        page.save(jpgName, 'JPEG')
	        text = pytesseract.image_to_string(Image.open(jpgName))
	        with open(ocrName, mode='w') as f:
	            f.write(text)