-
-
Save jvillemare/81887e9c53253c16e7ce0f9c60250779 to your computer and use it in GitHub Desktop.
import calendar  # for epoch time
import os  # for directory listing and path handling
import shutil  # for locating the ImageMagick executable
import subprocess  # for running the magick and tesseract commands
import time  # for epoch time

from PyPDF2 import PdfFileMerger
PDF_SUFFIX = '.pdf'
PNG_SUFFIX = '.png'


def strip_pdf_suffix(filename):
    """Return *filename* without its trailing ``.pdf`` extension.

    Only the final extension is removed, so a name such as
    ``my.pdf.scan.pdf`` keeps its inner ``.pdf``. Names that do not end in
    ``.pdf`` are returned unchanged.
    """
    if filename.endswith(PDF_SUFFIX):
        return filename[:-len(PDF_SUFFIX)]
    return filename


def files_with_suffix(names, suffix):
    """Return the entries of *names* that end with *suffix*, sorted by name.

    Sorting matters: the page images are numbered with a zero-padded
    ``%04d`` counter, so name order is page order.
    """
    return sorted(name for name in names if name.endswith(suffix))


def list_files(directory, suffix):
    """Return the sorted names of regular files in *directory* ending with *suffix*."""
    entries = [
        entry for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]
    return files_with_suffix(entries, suffix)


def imagemagick_executable():
    """Return ``'magick'`` when it is on PATH, otherwise the legacy ``'convert'``.

    On Windows a bare ``convert`` resolves to an unrelated system utility, so
    the ``magick`` entry point is preferred whenever it is available.
    """
    return 'magick' if shutil.which('magick') else 'convert'


def magick_command(executable, stem, export_prefix):
    """Build the argument list that renders ``<stem>.pdf`` to numbered PNGs."""
    return [
        executable,
        '-density', '150',
        stem + PDF_SUFFIX,
        export_prefix + '-%04d.png',
    ]


def tesseract_command(image_path):
    """Build the argument list that OCRs *image_path* into ``<image_path>-ocr.pdf``."""
    # 'pdf' is the name of the tesseract config file that selects PDF output.
    return ['tesseract', image_path, image_path + '-ocr', 'pdf']


def run_command(command):
    """Print and run *command* (an argument list); return True on exit status 0.

    The command is executed without a shell, so file names containing
    quotes or other shell metacharacters are passed through verbatim.
    """
    print(' '.join(command))
    try:
        return subprocess.run(command).returncode == 0
    except OSError as error:  # e.g. the executable is not installed
        print('Could not run ' + command[0] + ': ' + str(error))
        return False


def merge_pdfs(pdf_paths, output_path):
    """Concatenate *pdf_paths*, in the given order, into *output_path*."""
    # PdfFileMerger was renamed to PdfMerger and later removed from PyPDF2;
    # prefer the new name and fall back for old installations.
    try:
        from PyPDF2 import PdfMerger
    except ImportError:
        from PyPDF2 import PdfFileMerger as PdfMerger

    merger = PdfMerger()
    try:
        for path in pdf_paths:
            merger.append(path)
        merger.write(output_path)
    finally:
        merger.close()


def ocr_pdf(filename, epoch_time):
    """OCR one PDF from the current directory.

    The PDF is rendered to PNG pages inside a temporary folder named
    ``<epoch_time>_<stem>``, each page is OCR'd by tesseract into its own
    PDF, and the page PDFs are merged into ``<stem>-ocr-combined.pdf``.
    """
    print('Working on converting: ' + filename)

    # setup
    stem = strip_pdf_suffix(filename)  # file name without the extension
    folder = str(epoch_time) + '_' + stem  # folder for temporary images
    export_prefix = os.path.join(folder, stem)  # temporary export path
    os.makedirs(folder, exist_ok=True)

    # convert PDF to PNG(s)
    convert = magick_command(imagemagick_executable(), stem, export_prefix)
    if not run_command(convert):
        print('Skipping ' + filename + ': PDF to PNG conversion failed')
        return

    # convert PNG(s) to PDF(s) with OCR data
    for png in list_files(folder, PNG_SUFFIX):
        png_path = os.path.join(folder, png)
        print(png_path)
        if not run_command(tesseract_command(png_path)):
            print('OCR failed for ' + png_path)

    # combine OCR'd PDFs into one, in page order
    page_pdfs = [
        os.path.join(folder, name) for name in list_files(folder, PDF_SUFFIX)
    ]
    if not page_pdfs:
        print('Skipping ' + filename + ': no OCR output was produced')
        return
    merge_pdfs(page_pdfs, stem + '-ocr-combined.pdf')


def main():
    """OCR every PDF found in the current working directory."""
    dir_files = [f for f in os.listdir('.') if os.path.isfile(os.path.join('.', f))]
    epoch_time = int(calendar.timegm(time.gmtime()))
    print(dir_files)
    for filename in files_with_suffix(dir_files, PDF_SUFFIX):
        ocr_pdf(filename, epoch_time)


if __name__ == '__main__':
    main()
I don't think I'll be able to help you. I'm a very busy individual at the moment. It sounds like you almost got it. A bit more tinkering and you'll probably figure it out. Good luck!
Thank you. The video is very helpful
What if I want txt instead of PDF?
Thanks again.
Hi, how do I add .tif files to your code with ImageMagick?
Hey guys, pay attention to the class PdfFileMerger, it is deprecated.
Change "from PyPDF2 import PdfFileMerger" to "from PyPDF2 import PdfMerger" and "PdfFileMerger()" to "PdfMerger()"
getting this error
PS D:> python ./convert.py
['1.pdf', 'convert.py']
Working on converting: 1.pdf
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
Invalid Parameter - 150
getting this error PS D:> python ./convert.py ['1.pdf', 'convert.py'] Working on converting: 1.pdf convert -density 150 "1.pdf" "1738563963_1/1-%04d.png" Invalid Parameter - 150
apparently the new way to use imagemagick is
magick convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
instead of
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
in line 18 of the script should be
magick = 'magick convert -density 150 "' + file + '.pdf" "' + combined + '-%04d.png"'
if it still shows error try just removing the convert and use only 'magick -density....etc'
@BromTeque
hi , I want to run the script for pdf pages containing both english and a non-english language. i realized it can be done using the '+' parameter between the language codes, however it also assigns relative priority to the languages (based on which is mentioned before & after the + symbol).. the way to resolve that is probably using langdetect but I can't figure out how to code that .. can you please help me ?