Last active
April 1, 2021 08:45
-
-
Save cboulanger/043867872b874b2b9e3f6824792e2767 to your computer and use it in GitHub Desktop.
OCR of a scanned PDF, using imagemagick, ghostscript, poppler and tesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# OCR scanned PDFs (or single images) into searchable PDFs.
#
# Every file named on the command line is processed one page at a time:
# ImageMagick rasterizes the page to a monochrome 600 dpi TIFF, tesseract
# turns that TIFF into a one-page PDF with a text layer, and ghostscript
# finally merges all pages into "<input without extension>-ocr.pdf" next to
# the input file.
#
# Usage: ocr-pdf file1 [file2 ...]
# The OCR language is asked for interactively (default: eng). Several
# languages may be combined with '+', e.g. "eng+deu".
#
# builds on:
# https://ubuntuforums.org/showthread.php?t=1370827
# Prerequisites:
# On Mac OS:
# brew install --with-libtiff --with-ghostscript imagemagick
# brew install --all-languages tesseract
# brew install poppler

if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 files_to_OCR
Examples:
$0 file1 file2 ...
"
  # Exit status deliberately stays 0 here, as it always has been.
  exit
fi

# Fail early with a clear message instead of half-way through a document.
for tool in tesseract identify convert pdfinfo gs; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Required program '$tool' was not found in PATH" >&2
    exit 1
  fi
done

# choose language
# NOTE: the answer must not be stored in LANG -- that is the locale variable
# and changing it would alter the locale of every program started below.
read -r -p "Language of the PDFs [eng]: " ocr_lang
ocr_lang=${ocr_lang:-eng}

# Depending on the tesseract version the language list is printed on stdout
# or on stderr, so both streams are captured. Each requested code has to
# match a complete line; a substring match would accept e.g. "en" or words
# from the header line.
available_langs=$(tesseract --list-langs 2>&1)
IFS='+' read -r -a lang_codes <<<"$ocr_lang"
for code in "${lang_codes[@]}"; do
  if ! grep -qxF -- "$code" <<<"$available_langs"; then
    echo "Language with code '$code' is not supported" >&2
    exit 1
  fi
done

# Private scratch directory: avoids clashes between concurrent runs and is
# removed on every exit path.
work_dir=$(mktemp -d "${TMPDIR:-/tmp}/ocrpdf.XXXXXX") || {
  echo "Could not create a temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$work_dir"' EXIT
prefix="$work_dir/ocrpdfpage"

for i in "$@"; do
  # check file
  if [[ ! -f "$i" ]]; then
    echo "$i: file not found" >&2
    exit 1
  fi

  # check if graphics file can be converted
  if ! identify "$i" >/dev/null 2>&1; then
    echo "$i: not convertable" >&2
    exit 1
  fi

  echo "Processing '$i'" >&2

  # Get the number of pages; the conversion is done one page at a time.
  # pdfinfo only understands PDFs. For any other input that ImageMagick can
  # read (e.g. a PNG scan) fall back to a single page.
  pages=$(pdfinfo "$i" 2>/dev/null | awk '/^Pages:/ {print $2}')
  [[ "$pages" =~ ^[0-9]+$ ]] || pages=1

  # Strip only the last extension of the file name itself, so that dots in
  # directory names ("./scan.pdf") or in the name ("a.b.pdf") are preserved.
  file_name=${i##*/}
  if [[ "$file_name" == ?*.* ]]; then
    output="${i%.*}-ocr.pdf"
  else
    output="${i}-ocr.pdf"
  fi

  for (( j = 1; j <= pages; j++ )); do
    echo "Page $j"
    printf -v page_file '%s%05d' "$prefix" "$j"

    # converting the graphic file (ImageMagick page indices are zero-based)
    echo ">>> converting to image"
    convert -monochrome -density 600 "${i}[$((j - 1))]" "$page_file.tiff"
    if [[ ! -s "$page_file.tiff" ]]; then
      echo "$i: could not convert page $j to an image" >&2
      exit 1
    fi

    # apply OCR on this file
    echo ">>> applying OCR"
    # this suppresses stdout & stderr, remove the redirection for debugging
    tesseract "$page_file.tiff" "$page_file" -l "$ocr_lang" pdf >/dev/null 2>&1
    if [[ ! -s "$page_file.pdf" ]]; then
      echo "$i: OCR of page $j failed" >&2
      exit 1
    fi

    # remove temporary files
    rm -f -- "$page_file.tiff"
  done

  # join temporary files; the zero-padded names make the glob expand in
  # page order
  echo "Joining pages...."
  page_pdfs=("$prefix"*.pdf)
  if [[ ! -e "${page_pdfs[0]}" ]]; then
    echo "$i: no pages were produced" >&2
    exit 1
  fi
  if ! gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook \
    "-sOutputFile=$output" "${page_pdfs[@]}"; then
    echo "$i: joining pages failed" >&2
    exit 1
  fi
  # alternative : pdftk page_*.pdf cat output merged.pdf

  # Clear this document's pages so they cannot leak into the next one.
  rm -f -- "${page_pdfs[@]}"
  echo "$i completed"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Unfortunately, on line 24 "tesseract --list-langs" sends its output to STDERR, breaking the pipe. Change this line to:
tesseract --list-langs 2>&1 >/dev/null | grep $LANG
Otherwise, this was a very useful script. Thanks!