Created
May 17, 2021 12:50
-
-
Save jvillemare/81887e9c53253c16e7ce0f9c60250779 to your computer and use it in GitHub Desktop.
Basic Python Script for running Tesseract OCR on PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import calendar  # for epoch time
import os  # for directory listings and path handling
import shutil  # for locating the ImageMagick executable
import subprocess  # for magick and tesseract commands
import time  # for epoch time

from PyPDF2 import PdfFileMerger
OCR_SUFFIX = '-ocr-combined.pdf'  # suffix given to every merged, OCR'd output


def _imagemagick_command():
    """Return the argv prefix that invokes ImageMagick's converter.

    ImageMagick 7 ships a ``magick`` executable. The bare ``convert`` name is
    only a legacy alias and, on Windows, resolves to the unrelated system
    ``convert.exe`` (which fails with "Invalid Parameter - 150"), so
    ``magick`` is preferred whenever it is on the PATH.
    """
    if shutil.which('magick'):
        return ['magick']
    return ['convert']


def _run(command):
    """Echo *command*, run it without a shell, and return True on success.

    Passing an argument list (instead of a shell string) keeps file names
    containing quotes, spaces or shell metacharacters intact.
    """
    print(' '.join(command))
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as error:
        # OSError covers a missing executable; CalledProcessError a non-zero exit.
        print('Command failed: ' + str(error))
        return False
    return True


def _files_with_suffix(directory, suffix):
    """Return the sorted names of regular files in *directory* ending in *suffix*.

    The match is case-insensitive. Sorting matters because ``os.listdir``
    returns names in arbitrary order and the zero-padded page numbers must be
    merged in sequence.
    """
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(suffix)
        and os.path.isfile(os.path.join(directory, name))
    )


def _ocr_pdf(directory, pdf_name, stamp):
    """OCR one PDF and return the path of the combined output.

    Args:
        directory: Folder that contains *pdf_name*; the output is written here.
        pdf_name: File name of the PDF to process (with extension).
        stamp: Integer used to prefix the temporary image folder name.

    Returns:
        The path of the written ``<name>-ocr-combined.pdf``, or ``None`` if
        any conversion step failed (nothing is written in that case).
    """
    # Strip only the real extension; str.replace would remove every '.pdf'.
    stem = os.path.splitext(pdf_name)[0]
    folder = os.path.join(directory, str(stamp) + '_' + stem)
    os.makedirs(folder, exist_ok=True)

    # convert PDF to PNG(s), one image per page
    page_pattern = os.path.join(folder, stem + '-%04d.png')
    source = os.path.join(directory, pdf_name)
    rasterize = _imagemagick_command() + ['-density', '150', source, page_pattern]
    if not _run(rasterize):
        return None

    # convert PNG(s) to PDF(s) with OCR data
    for png in _files_with_suffix(folder, '.png'):
        page = os.path.join(folder, png)
        print(page)
        # 'pdf' is the name of tesseract's output config file (lowercase).
        if not _run(['tesseract', page, page + '-ocr', 'pdf']):
            return None

    # combine OCR'd PDFs into one
    pages = _files_with_suffix(folder, '.pdf')
    if not pages:
        print('No OCR pages were produced for: ' + pdf_name)
        return None
    output = os.path.join(directory, stem + OCR_SUFFIX)
    merger = PdfFileMerger()
    try:
        for page in pages:
            merger.append(os.path.join(folder, page))
        merger.write(output)
    finally:
        merger.close()
    return output


def main(directory='.'):
    """Run Tesseract OCR on every PDF in *directory*.

    Each PDF is rasterized with ImageMagick into a temporary
    ``<epoch>_<name>`` folder, every page image is OCR'd with Tesseract, and
    the per-page PDFs are merged into ``<name>-ocr-combined.pdf`` next to the
    original. Outputs of a previous run are skipped so that re-running the
    script does not OCR its own results.

    Args:
        directory: Folder to scan; defaults to the current directory.

    Returns:
        A list with the paths of the combined PDFs that were written.
    """
    dir_files = [f for f in os.listdir(directory)
                 if os.path.isfile(os.path.join(directory, f))]
    epoch_time = calendar.timegm(time.gmtime())
    print(dir_files)

    written = []
    for name in sorted(dir_files):  # look at every file in the directory
        lowered = name.lower()
        if not lowered.endswith('.pdf') or lowered.endswith(OCR_SUFFIX):
            continue
        print('Working on converting: ' + name)
        output = _ocr_pdf(directory, name, epoch_time)
        if output is not None:
            written.append(output)
    return written


if __name__ == '__main__':
    main()
I'm getting this error: PS D:> python ./convert.py ['1.pdf', 'convert.py'] Working on converting: 1.pdf convert -density 150 "1.pdf" "1738563963_1/1-%04d.png" Invalid Parameter - 150
Apparently the new way to invoke ImageMagick is:
magick convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
instead of
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
So line 18 of the script should be:
magick = 'magick convert -density 150 "' + file + '.pdf" "' + combined + '-%04d.png"'
If it still shows an error, try removing `convert` and using only 'magick -density ...etc'.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
getting this error
PS D:> python ./convert.py
['1.pdf', 'convert.py']
Working on converting: 1.pdf
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
Invalid Parameter - 150