Last active
February 14, 2024 08:27
-
-
Save kueda/c02b9f3f5a0f03f41524 to your computer and use it in GitHub Desktop.
OS X bash script that turns a collection of images into an OCR'd PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# img2pdf
#
# OS X bash script that turns a collection of images into an OCR'd PDF
#
# Adapted from http://apple.stackexchange.com/questions/128384/ocr-on-pdfs-in-os-x-with-free-open-source-tools,
# where it was in turn adapted from
# http://www.morethantechnical.com/2013/11/21/creating-a-searchable-pdf-with-opensource-tools-ghostscript-hocr2pdf-and-tesseract-ocr/
# from http://www.ehow.com/how_6874571_merge-pdf-files-ghostscript.html
# bash tut: http://linuxconfig.org/bash-scripting-tutorial
# Linux PDF,OCR: http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
# Dealing w/ alpha: http://stackoverflow.com/questions/5083492/problem-with-tesseract-and-tiff-format
#
# Install
#   brew install tesseract --HEAD
#   brew install imagemagick
#   brew install ghostscript
#   chmod +x img2pdf
#
# Usage
#   ./img2pdf *.gif
#
# If you have a mix of extensions:
#   ./img2pdf *.{gif,jpeg}
#
# Main script body.
#
# Arguments: one or more image files; each becomes one page of the result.
# Output:    <stem of first image>.searchable.pdf in the current directory,
#            with pages in the order the images were given on the command
#            line.
# Exit:      0 on success, 1 if a tool is missing or a step fails,
#            2 when called without arguments.
#
# All intermediate files live in a private temporary directory, so nothing
# next to the source images is created, overwritten or deleted.

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -eq 0 ]; then
  printf 'Usage: %s image...\n' "${0##*/}" >&2
  exit 2
fi

for tool in convert tesseract gs; do
  command -v "$tool" >/dev/null 2>&1 || die "Required tool not found: $tool"
done

# Only prefix the working directory when the first image is a relative path.
case "$1" in
  /*) y=$1 ;;
  *) y="$(pwd)/$1" ;;
esac
printf '%s\n' "Creating a searchable PDF for $y"
x=$(basename "$y")
name=${x%.*}

workdir=$(mktemp -d "${TMPDIR:-/tmp}/img2pdf.XXXXXX") ||
  die "Could not create a temporary directory"
trap 'rm -rf -- "$workdir"' EXIT
# Some shells skip the EXIT trap when killed by a signal; force a normal exit.
trap 'exit 1' HUP INT TERM

# process each page
#
# The word list of the for loop is expanded once, before the first iteration,
# so appending each finished page to the positional parameters inside the loop
# is safe. After the loop the original image arguments are shifted away and
# "$@" holds exactly the per-page PDFs, in order.
total=$#
n=0
for f in "$@"; do
  n=$((n + 1))
  printf '%s\n' "$f"

  # A leading "-" would otherwise be parsed by convert as an option.
  case "$f" in
    -*) src="./$f" ;;
    *) src=$f ;;
  esac
  page="$workdir/$(printf 'page-%05d' "$n")"

  printf '\tConverting to TIFF...\n'
  # Flatten onto white and drop the alpha channel before OCR.
  convert "$src" -background white -flatten +matte "$page.tiff" ||
    die "convert failed for $f"

  printf '\tTesseract OCR...\n'
  # NOTE(review): newer Tesseract releases appear to spell this option
  # "--psm" -- confirm against the installed version.
  tesseract -l eng -psm 3 "$page.tiff" "$page" pdf >/dev/null 2>&1 ||
    die "tesseract failed for $f"
  [ -f "$page.pdf" ] || die "tesseract produced no PDF for $f"

  printf '\tCleanup...\n'
  # The .txt side output is not always written, hence -f.
  rm -f -- "$page.tiff" "$page.txt"

  set -- "$@" "$page.pdf"
done
shift "$total"

printf '%s\n' "Combining all pages into a single PDF..."
gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
  "-sOutputFile=${name}.searchable.pdf" "$@" ||
  die "ghostscript failed to combine the pages"
printf '%s\n' "Created $name.searchable.pdf"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment