Created
September 13, 2016 11:08
-
-
Save stekhn/085a40c8286f1ab296472a910d7c4823 to your computer and use it in GitHub Desktop.
Bash script to convert image PDFs to text using OCR. The script uses pdftk, imagemagick, ghostscript and tesseract-ocr.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Requires: pdftk, ImageMagick, Ghostscript and tesseract-ocr.
# The input PDF should have a resolution of 300 dpi or higher.
#set -o errexit
# Report that the step announced on the current line succeeded.
# Arguments: none
# Outputs:   a check mark wrapped in ANSI colour codes, followed by a
#            newline, on stdout (this finishes the "Doing X ..." line).
success() {
  # printf interprets the \033 escapes in every shell; echo only does so
  # in some of them (bash, for example, prints the backslashes literally).
  printf '\033[33;32m ✓ \033[33;00m\n'
}
# Report that the step announced on the current line failed, then abort.
# Arguments: $1 - message describing what went wrong
# Outputs:   a red cross on stdout (this finishes the "Doing X ..." line)
#            and "Error: <message>" in red on stderr.
# Exits:     always terminates the calling shell with status 1.
failure() {
  printf '\033[33;31m ✗\033[33;00m\n'
  # The message is passed as a printf argument so that spaces, glob
  # characters and backslashes in it are printed verbatim. The trailing
  # escape resets the colour so the terminal is not left red after exit.
  printf '\033[33;31mError: %s\033[33;00m\n' "$1" >&2
  exit 1
}
# ---------------------------------------------------------------------------
# Main script: convert an image-only PDF into a plain-text file using OCR.
#
# Arguments:
#   $1 - PDF file to convert (required)
#   $2 - language code handed to `tesseract -l` (optional, default "eng")
#
# Steps:
#   1. pdftk splits the PDF into one PDF per page.
#   2. convert renders every page as a 300 dpi, 8-bit grayscale PNG.
#   3. convert reduces every PNG to monochrome.
#   4. tesseract recognises the text of every monochrome image.
#   5. The per-page text files are concatenated into "<input name>.txt"
#      in the current directory.
#
# All intermediate files live in a private temporary directory, so files in
# the current directory that happen to match pg* are never read or deleted.
# Relies on the success and failure helpers defined earlier in this file.
# ---------------------------------------------------------------------------

# Deliberately not called LANG: that variable selects the locale of every
# child process, and "eng" is not a valid locale name.
ocr_lang="eng"

if [ -z "${1:-}" ]; then
  printf 'Opening file ...'
  failure "No input file specified."
fi

if [ -z "${2:-}" ]; then
  echo "No language specified, using English as default"
else
  ocr_lang=$2
fi

workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdf-ocr.XXXXXX") \
  || failure "Could not create a temporary directory."
# Remove the intermediates on every exit path, including failure().
trap 'rm -rf -- "${workdir:?}"' EXIT
# Turn signals into a normal exit so the EXIT trap above also runs for them.
trap 'exit 130' INT
trap 'exit 143' TERM

printf 'Splitting PDF file ...'
# In the else branch $? still holds the status of the tested command.
if pdftk "$1" burst output "$workdir/pg_%04d.pdf"; then
  success
else
  failure "pdftk exited with status $?"
fi

printf 'Converting PDF files to images ...'
for page in "$workdir"/pg_*.pdf; do
  # An unmatched glob stays literal; treat that as "no pages were produced".
  [ -e "$page" ] || failure "No pages were produced from $1"
  # Every page is checked individually; a status test after the loop would
  # only see the result of the last page.
  convert -units pixelsperinch -density 300x300 -colorspace Gray -depth 8 \
    "$page" "${page%.pdf}.png" \
    || failure "convert exited with status $? for $page"
done
success

printf 'Converting images to monochrome ...'
# The glob is expanded once, before the loop starts, so the *-m.png files
# created inside the loop are not picked up again.
for image in "$workdir"/pg_*.png; do
  convert "$image" +dither -monochrome -normalize "${image%.png}-m.png" \
    || failure "convert exited with status $? for $image"
done
success

printf 'Character recognition ...'
for image in "$workdir"/pg_*-m.png; do
  # tesseract appends .txt to the output base name itself.
  tesseract "$image" "${image%.png}" -l "$ocr_lang" quiet \
    || failure "tesseract exited with status $? for $image"
done
success

printf 'Preparing text file ...'
dump="$(basename "$1" .pdf).txt"
# Overwrite rather than append, so running the script twice on the same PDF
# does not duplicate the text. The zero-padded page numbers make the glob
# expand in page order.
if cat "$workdir"/pg_*-m.txt > "$dump"; then
  success
else
  failure "Could not write $dump (cat exited with status $?)"
fi

printf 'Cleaning up ...'
if rm -rf -- "${workdir:?}"; then
  success
else
  failure "Could not remove $workdir (rm exited with status $?)"
fi

exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment