Skip to content

Instantly share code, notes, and snippets.

@stekhn
Created September 13, 2016 11:08
Show Gist options
  • Save stekhn/085a40c8286f1ab296472a910d7c4823 to your computer and use it in GitHub Desktop.
Save stekhn/085a40c8286f1ab296472a910d7c4823 to your computer and use it in GitHub Desktop.
Bash script to convert image PDFs to text using OCR. The script uses pdftk, imagemagick, ghostscript and tesseract-ocr.
#!/bin/sh
#needs: pdftk, imagemagick, ghostscript and tesseract-ocr
#pdf should be 300dpi or higher resolution
#set -o errexit
# Print a green check mark followed by a colour reset, terminating the
# progress line of the current step.
# Arguments: none
# Outputs:   the coloured check mark and a newline on stdout
#
# Defined with the POSIX `name()` form (the `function` keyword is not
# available in every /bin/sh) and printed with printf, because whether echo
# interprets "\033" depends on the shell.
success() {
  printf '\033[33;32m ✓ \033[33;00m\n'
}
# Print a red cross on stdout and a red error message on stderr, then
# terminate the script.
# Arguments: $1 - text shown after "Error: "
# Outputs:   cross mark on stdout; "Error: <text>" on stderr
# Returns:   never returns; exits the shell with status 1
#
# The message is passed to printf as a quoted argument so that spaces and
# glob characters in it are printed verbatim, and the colour is reset after
# each line so the terminal is not left red once the script has exited.
failure() {
  printf '\033[33;31m ✗\033[33;00m\n'
  printf '\033[33;31mError: %s\033[33;00m\n' "${1-}" >&2
  exit 1
}
# Resolve the command-line arguments.
#   $1 - path of the PDF to process (required, must be readable)
#   $2 - language code handed to tesseract via -l (optional, default "eng")
#
# NOTE: LANG is also the standard locale environment variable. When it is
# inherited from the caller's environment, assigning it here changes the
# locale seen by every child process. The name is kept because the
# character-recognition step reads $LANG.
LANG="eng"
if [ -z "${1-}" ]; then
  printf 'Opening file ...'
  failure "No input file specified."
fi
# Stop early with a clear message instead of letting pdftk fail later.
if [ ! -r "$1" ]; then
  printf 'Opening file ...'
  failure "Cannot read input file: $1"
fi
if [ -z "${2-}" ]; then
  echo "No language specified, using English as default"
else
  LANG=$2
fi
# Split the input PDF with `pdftk burst`; the following stage picks the
# resulting per-page files up through the pg*.pdf glob.
# printf without a trailing newline keeps the status mark on the same line.
printf 'Splitting PDF file ...'
if pdftk "$1" burst; then
  success
else
  # Inside this branch $? is still pdftk's own exit status. Testing it with
  # `[ $? -eq 0 ]` first would replace it with the status of `[` itself.
  failure "pdftk exited with status $?"
fi
# Render every burst page (pg*.pdf) to a 300 dpi, 8-bit greyscale PNG with
# ImageMagick's convert. Each page is checked individually: the exit status
# of a whole for-loop is only that of its last iteration, so a failure on an
# earlier page would otherwise go unnoticed.
printf 'Converting PDF files to images ...'
trap "exit" INT
for i in pg*.pdf; do
  # An unmatched glob stays literal; report that instead of a convert error.
  [ -e "$i" ] || failure "No page files (pg*.pdf) found."
  # ${i%.pdf} strips the extension, so pg_0001.pdf becomes pg_0001.png.
  convert -units pixelsperinch -density 300x300 -colorspace Gray -depth 8 \
    "$i" "${i%.pdf}.png" || failure "convert failed on $i (status $?)"
done
success
# Produce a normalised, undithered monochrome copy "<page>-m.png" of every
# page image (pg*.png). Each conversion is checked on its own because the
# exit status of a for-loop only reflects its last iteration.
printf 'Converting images to monochrome ...'
trap "exit" INT
for i in pg*.png; do
  # Skip monochrome outputs left behind by an earlier, aborted run so they
  # are not converted (and later recognised) a second time.
  case $i in
    *-m.png) continue ;;
  esac
  # An unmatched glob stays literal; report that instead of a convert error.
  [ -e "$i" ] || failure "No page images (pg*.png) found."
  convert "$i" +dither -monochrome -normalize "${i%.png}-m.png" \
    || failure "convert failed on $i (status $?)"
done
success
# Run tesseract on every monochrome image (*-m.png). The output base name is
# the image name without ".png"; the next stage collects the results through
# the pg*.txt glob. The language comes from $LANG (set during argument
# handling); to hard-code one, replace "$LANG" below, e.g. -l eng.
# Each image is checked on its own because the exit status of a for-loop
# only reflects its last iteration.
printf 'Character recognition ...'
trap "exit" INT
for i in *-m.png; do
  # An unmatched glob stays literal; report that instead of an OCR error.
  [ -e "$i" ] || failure "No monochrome images (*-m.png) found."
  tesseract "$i" "${i%.png}" -l "$LANG" quiet \
    || failure "tesseract failed on $i (status $?)"
done
success
# Append the recognised text of all pages (pg*.txt, in glob order) to
# "<input name without .pdf>.txt" in the current directory. The file is
# created by the >> redirection if it does not exist yet and is appended to
# if it does.
printf 'Preparing text file ...'
# Quoted so that input names containing spaces yield a single file name.
DUMP="$(basename "$1" .pdf).txt"
if cat pg*.txt >> "$DUMP"; then
  success
else
  # $? is still cat's exit status inside this branch.
  failure "cat exited with status $?"
fi
# Remove every intermediate file matching pg* from the current directory
# (burst pages, PNGs and per-page text files) and finish successfully.
# The files are deleted directly instead of being moved into a ".trash"
# directory first: that variant ignored mkdir/mv failures and also deleted
# any ".trash" directory that already existed.
printf 'Cleaning up ...'
if rm -rf -- pg*; then
  success
else
  # $? is still rm's exit status inside this branch.
  failure "rm exited with status $?"
fi
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment