Last active
April 20, 2016 13:53
-
-
Save oiva/6050045 to your computer and use it in GitHub Desktop.
PDF.py is a small script that converts scanned images to PDFs, reads the text from the PDF with Tesseract OCR, and adds the text to the PDF metadata keywords. Currently the script looks for images in the same directory and keeps the output PDF there too. Probably only works in OS X. If the text file exists already, the OCR is not run again. Ther…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Find scanned images or PDFs in directory, run Tesseract OCR on them. | |
Generate PDF, import the text content into the PDF's metadata. | |
""" | |
#!/usr/bin/env python | |
import codecs
import os
import re
import subprocess
from distutils.spawn import find_executable
# Language code handed to Tesseract via its `-l` option in run_ocr.
LANG = "fin"
# Only files in the working directory with exactly this extension are handled.
INPUT_EXTENSION = ".pdf" # tested with .png and .pdf
# Suffix placed between the input's base name and ".pdf" to name the output.
OUTPUT_APPENDIX = "_ocr"
# Value passed to ImageMagick `convert -density` when an image becomes a PDF.
OUTPUT_DPI = 300
# When False, handle_file skips run_ocr and only imports an existing .txt file.
RUN_OCR = False
def handle_file(filepath):
    """Produce a metadata-filled PDF for the given file.

    Files that are themselves earlier outputs, and files whose output PDF
    already exists, are skipped. Otherwise OCR is run when ``RUN_OCR`` is
    true, the text is imported into the PDF metadata, the temporary files
    are removed and the original is deleted once a non-empty output exists.

    Args:
        filepath: Path of the input file (an image or a PDF).

    Raises:
        OSError: If the original file cannot be removed after a successful
            conversion.
    """
    # Derive the name parts from the argument rather than from module-level
    # globals that happen to be set by the caller's loop.
    filename, extension = os.path.splitext(filepath)
    output = "%s%s.pdf" % (filename, OUTPUT_APPENDIX)
    pdf = filename + ".pdf"
    png = filename + ".png"
    txt = filename + ".txt"

    # A stem that already ends with the output suffix marks an old output
    # file; it must not be processed again.
    if filename.endswith(OUTPUT_APPENDIX):
        return
    # Nothing to do when the output file exists already.
    if os.path.exists(output):
        return

    if RUN_OCR:
        run_ocr(filepath)
    import_metadata(filepath)

    # Remove temporary files: the text file plus the intermediate that was
    # generated (a PNG for PDF input, a PDF for image input). Missing files
    # are expected here, so failures are deliberately ignored.
    intermediate = png if extension == ".pdf" else pdf
    for temporary in (txt, intermediate):
        try:
            os.remove(temporary)
        except OSError:
            pass

    # Delete the original only when the OCR'd version was really written;
    # an empty output file means the conversion failed and the original
    # must be kept.
    if os.path.isfile(output) and os.path.getsize(output) > 0:
        os.remove(filepath)
def _page_images(filename):
    """Return the sorted per-page PNGs (``<filename>-<digits>.png``) on disk."""
    directory, base = os.path.split(filename)
    page_name = re.compile(re.escape(base) + r"-\d+\.png$")
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory or ".")
        if page_name.match(name)
    )


def run_ocr(filepath):
    """Run a file (either PDF or image) through OCR and produce a ``.txt``.

    A PDF is first rendered to one PNG per page with Ghostscript; the pages
    are stacked into a single PNG with ImageMagick and the per-page files
    are deleted. Tesseract then writes ``<base name>.txt``. Nothing happens
    when that text file exists already.

    External tools are invoked with argument lists instead of shell strings,
    so file names containing spaces or shell metacharacters are passed
    through unchanged.

    Args:
        filepath: Path of the PDF or image to recognise.
    """
    # Derive the name parts locally instead of reading module-level globals.
    filename, extension = os.path.splitext(filepath)
    txt = filename + ".txt"

    # The OCR is not repeated when the text file exists already.
    if os.path.exists(txt):
        return

    if extension == ".pdf":
        # Convert the PDF to PNGs for OCR.
        png = filename + ".png"
        print("convert %s to %s-*" % (filepath, filename))
        # Ghostscript produces one PNG per page ("%03d" is expanded by
        # Ghostscript itself), at most 100 pages, rendered at 300 dpi.
        subprocess.call([
            "gs", "-sDEVICE=pngalpha",
            "-sOutputFile=%s-%%03d.png" % filename,
            "-dLastPage=100", "-r300", "-dNOPAUSE", "-dBATCH", "-q",
            filepath, "-c", "quit",
        ])
        pages = _page_images(filename)
        if not pages:
            # Ghostscript rendered nothing, so there is no image to read.
            print("no pages rendered from %s, skipping OCR" % filepath)
            return
        # Stack the pages vertically into one image, then drop the pages.
        subprocess.call(["convert", "-append"] + pages + [png])
        for page in pages:
            os.remove(page)
        ocr_input = png
    else:
        ocr_input = filepath

    # Run OCR on the image; Tesseract appends ".txt" to the output base.
    print("OCR %s to %s" % (ocr_input, txt))
    subprocess.call(["tesseract", "-l", LANG, ocr_input, filename])
def import_metadata(filepath):
    """Import the OCR text of a file into the metadata of its PDF.

    Reads the text file that shares the input's base name, generates a PDF
    from the input when the input is an image, rewrites the text file as a
    ``pdfmark`` instruction setting ``/Keywords`` and lets Ghostscript write
    ``<base name><OUTPUT_APPENDIX>.pdf`` with that metadata applied.

    Returns without doing anything when the text file cannot be read.

    Args:
        filepath: Path of the original input file (an image or a PDF).
    """
    # Derive the name parts locally instead of reading module-level globals.
    filename, extension = os.path.splitext(filepath)
    txt = filename + ".txt"
    pdf = filename + ".pdf"
    output = filename + "%s.pdf" % OUTPUT_APPENDIX

    try:
        with open(txt, "r") as text_file:
            ocr_text = text_file.read()
    except IOError:
        # Text file not found: there is nothing to import.
        return

    # If the input is an image, generate the PDF from it.
    if extension != ".pdf":
        print("convert %s to %s" % (filepath, pdf))
        subprocess.call(
            ["convert", "-density", str(OUTPUT_DPI), filepath, pdf])

    # Turn the OCR result into a single metadata value: commas in the text
    # are backslash-escaped (presumably to tell them apart from the ", "
    # separators added below), the text is split on newlines, each line is
    # stripped and empty lines are dropped.
    ocr_text = ocr_text.replace(',', '\\,')
    lines = [line.strip() for line in re.split(r"[\n\r]+", ocr_text)]
    keywords = ", ".join(line for line in lines if line)

    # PDF metadata wants either ASCII or UTF-16-BE; to_unicode picks one.
    meta = "[ /Keywords %s\n /DOCINFO pdfmark" % to_unicode(keywords)
    # The text file is overwritten with the pdfmark instruction.
    with open(txt, "w") as text_file:
        text_file.write(meta)

    # Ghostscript processes the PDF followed by the pdfmark file and writes
    # the result, metadata included, to the output file.
    print("update PDF metadata with %s" % txt)
    subprocess.call([
        "gs", "-q", "-dBATCH", "-dNOPAUSE", "-sDEVICE=pdfwrite",
        "-sOutputFile=%s" % output, pdf, txt,
    ])
def to_unicode(string):
    """Convert text into a PDF string object usable in PDF metadata.

    Pure ASCII text becomes a literal string ``(...)`` with backslash,
    parentheses and tab escaped. Any other text becomes a hexadecimal string
    ``<...>`` holding the UTF-16-BE encoding preceded by its byte order mark.

    Args:
        string: Either a UTF-8 encoded byte string or already decoded text.

    Returns:
        The PDF string object as text.

    Raises:
        UnicodeDecodeError: If a byte string argument is not valid UTF-8.
    """
    # Only byte strings need decoding; text is used as it is. This keeps the
    # function usable on Python 2 byte strings and on Python 3 str alike.
    if isinstance(string, bytes):
        string = string.decode('utf_8')
    try:
        string.encode('ascii')
    except UnicodeEncodeError:
        encoded = codecs.BOM_UTF16_BE + string.encode('utf_16_be')
        # bytearray yields integers on every Python version, so no ord().
        digits = ''.join('{:02X}'.format(octet) for octet in bytearray(encoded))
        return '<{}>'.format(digits)
    # Escape the characters that act as control characters inside a PDF
    # literal string. The backslash must come first so that the escapes
    # added afterwards are not escaped again.
    for plain, escaped in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'),
                           ('\t', '\\t')]:
        string = string.replace(plain, escaped)
    return '({})'.format(string)
# Check that Tesseract, ImageMagick and Ghostscript are installed; abort with
# an installation hint as soon as one of them is missing. The messages are
# built from adjacent string literals so that no source whitespace leaks into
# them and no words are fused across a line continuation.
TESSERACT = find_executable("tesseract")
CONVERT = find_executable("convert")
GS = find_executable("gs")
if TESSERACT is None:
    print("This script requires Tesseract, aborting. Install Tesseract with:\n"
          "brew install tesseract --all-languages")
    exit(1)
if CONVERT is None:
    print("This script requires Imagemagick, aborting. Install Imagemagick "
          "with:\nbrew install imagemagick")
    exit(1)
if GS is None:
    print("This script requires Ghostscript, aborting. Install Ghostscript "
          "with:\nbrew install gs")
    exit(1)

# Process every regular file in the working directory whose extension is
# exactly INPUT_EXTENSION.
print("Looking for %ss..." % INPUT_EXTENSION)
for f in os.listdir("."):
    if not os.path.isfile(f):
        continue
    # NOTE: `filename` and `extension` are intentionally module-level names;
    # functions in this file have relied on them as globals describing the
    # file currently being handled.
    filename, extension = os.path.splitext(f)
    if extension != INPUT_EXTENSION:
        continue
    handle_file(f)
exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment