-
-
Save jvillemare/81887e9c53253c16e7ce0f9c60250779 to your computer and use it in GitHub Desktop.
import os # for magick and tesseract commands | |
import time # for epoch time | |
import calendar # for epoch time | |
from PyPDF2 import PdfFileMerger | |
dir_files = [f for f in os.listdir(".") if os.path.isfile(os.path.join(".", f))] | |
epoch_time = int(calendar.timegm(time.gmtime())) | |
print(dir_files) | |
for file in dir_files: # look at every file in the current directory | |
if file.endswith('.pdf'): # if it is a PDF, use it | |
print('Working on converting: ' + file) | |
# setup | |
file = file.replace('.pdf', '') # get just the filepath without the extension | |
folder = str(int(epoch_time)) + '_' + file # generate a folder name for temporary images | |
combined = folder + '/' + file # come up with temporary export path | |
# create folder | |
if not os.path.exists(folder): # make the temporary folder | |
os.makedirs(folder) | |
# convert PDF to PNG(s) | |
magick = 'convert -density 150 "' + file + '.pdf" "' + combined + '-%04d.png"' | |
print(magick) | |
os.system(magick) | |
# convert PNG(s) to PDF(s) with OCR data | |
pngs = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] | |
for pic in pngs: | |
if pic.endswith('.png'): | |
combined_pic = folder + '/' + pic | |
print(combined_pic) | |
tesseract = 'tesseract "' + combined_pic + '" "' + combined_pic + '-ocr" PDF' | |
print(tesseract) | |
os.system(tesseract) | |
# combine OCR'd PDFs into one | |
ocr_pdfs = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] | |
merger = PdfFileMerger() | |
for pdf in ocr_pdfs: | |
if pdf.endswith('.pdf'): | |
merger.append(folder + '/' + pdf) | |
merger.write(file + '-ocr-combined.pdf') | |
merger.close() |
Hey guys, pay attention to the command pdffilemerger,it is deprecated.
Change "from PyPDF2 import PdfMerger" to "from PyPDF2 import PdfMerger" and "PdfFileMerger()" to "PdfMerger()"
getting this error
PS D:> python ./convert.py
['1.pdf', 'convert.py']
Working on converting: 1.pdf
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
Invalid Parameter - 150
getting this error PS D:> python ./convert.py ['1.pdf', 'convert.py'] Working on converting: 1.pdf convert -density 150 "1.pdf" "1738563963_1/1-%04d.png" Invalid Parameter - 150
apparently the new way to use imagemagick is
magick convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
instead of
convert -density 150 "1.pdf" "1738563963_1/1-%04d.png"
in line 18 of the script should be
magick = 'magick convert -density 150 "' + file + '.pdf" "' + combined + '-%04d.png"'
if it still shows error try just removing the convert and use only 'magick -density....etc'
Hi, how do i add . tif files to your code with Imagemagick?