cboulanger · April 1, 2021 08:45 · GlennBell · Dec 18, 2018
diff --git a/pdfocr.sh b/pdfocr.sh
 #!/bin/bash

 # pdfocr.sh - OCR the given files page by page and write one PDF per input.
 #
 # Usage: pdfocr.sh file1 [file2 ...]
 #
 # For every input file the script renders each page to a 600 dpi monochrome
 # TIFF with ImageMagick, lets tesseract turn that TIFF into a one-page PDF and
 # finally joins the page PDFs with Ghostscript into
 # "<input without its last extension>-ocr.pdf" next to the input file.
 # The OCR language is asked for interactively (default: eng).
 #
 # Exit status: 1 as soon as a file cannot be processed.
 #
 # builds on:
 # https://ubuntuforums.org/showthread.php?t=1370827

 # Prerequisites:
 # On Mac OS:
 # brew install --with-libtiff --with-ghostscript imagemagick
 # brew install --all-languages tesseract
 # brew install poppler

 if [[ $# -eq 0 ]]
 then
 	echo "Usage: $0 files_to_OCR
 	Examples:
 	$0 file1 file2 ...
 	"
 	exit
 fi

 # choose language
 # The answer is kept in ocr_lang, not LANG: LANG is the locale variable and
 # overwriting it would change the locale of the tools started below.
 read -r -p "Language of the PDFs [eng]: " ocr_lang
 ocr_lang=${ocr_lang:-eng}
 # Compare against whole lines of the list so that e.g. "en" is not accepted
 # just because "eng" is installed. stderr is captured as well because some
 # tesseract releases reportedly print the list there - TODO confirm.
 available_langs=$(tesseract --list-langs 2>&1)
 if ! grep -Fqx -- "$ocr_lang" <<<"$available_langs"; then
 	echo "Language with code '$ocr_lang' is not supported"
 	exit 1
 fi

 # Private scratch directory: no clashes with other runs or stale files, and
 # the EXIT trap removes it on every exit path, including the error exits.
 workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX") || {
 	echo "cannot create temporary directory" >&2
 	exit 1
 }
 trap 'rm -rf -- "$workdir"' EXIT

 # "$@" keeps every file name intact, even with spaces or glob characters.
 for i in "$@"
 do
 	# check file
 	if [[ ! -f "$i" ]]; then
 		echo "$i: file not found" >&2
 		exit 1
 	fi

 	# check if graphics file can be converted
 	if ! identify "$i" >/dev/null 2>&1; then
 		echo "$i: not convertable" >&2
 		exit 1
 	fi

 	echo "Processing '$i'" >&2

 	# get number of pages and perform conversion one page at a time
 	pages=$(pdfinfo "$i" 2>/dev/null | awk '/^Pages:/ {print $2}')
 	# Inputs for which pdfinfo reports no page count (presumably plain image
 	# files that identify accepted) are handled as a single page.
 	[[ "$pages" =~ ^[0-9]+$ ]] || pages=1

 	# Output name: strip only the last extension of the file name itself, so
 	# "./scan.pdf" becomes "./scan-ocr.pdf" and dots in directories are kept.
 	file_name=${i##*/}
 	case "$i" in
 		*/*) file_dir=${i%/*}/ ;;
 		*) file_dir= ;;
 	esac
 	stem=${file_name%.*}
 	# a name like ".pdf" has nothing before the dot; keep it whole
 	[[ -n "$stem" ]] || stem=$file_name
 	output="${file_dir}${stem}-ocr.pdf"

 	page_pdfs=()
 	for (( j = 1; j <= pages; j++ )); do
 		echo "Page $j"
 		printf -v page_base '%s/page%05d' "$workdir" "$j"
 		# converting the graphic file (ImageMagick page indexes start at 0)
 		echo ">>> converting to image"
 		if ! convert -monochrome -density 600 "${i}[$(( j - 1 ))]" "$page_base.tiff"; then
 			echo "$i: converting page $j failed" >&2
 			exit 1
 		fi
 		# apply OCR on this file
 		echo ">>> applying OCR"
 		tesseract "$page_base.tiff" -l "$ocr_lang" "$page_base" pdf >/dev/null 2>&1 # this suppresses stdout & stderr, comment out for debugging
 		# remove temporary files
 		rm -f -- "$page_base.tiff"
 		# A missing page must not be dropped silently from the result.
 		if [[ ! -s "$page_base.pdf" ]]; then
 			echo "$i: OCR of page $j failed" >&2
 			exit 1
 		fi
 		page_pdfs+=("$page_base.pdf")
 	done

 	if (( ${#page_pdfs[@]} == 0 )); then
 		echo "$i: no pages to join" >&2
 		exit 1
 	fi

 	# join temporary files (zero-padded names were collected in page order)
 	echo "Joining pages...."
 	if ! gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook "-sOutputFile=$output" "${page_pdfs[@]}"; then
 		echo "$i: joining pages failed" >&2
 		exit 1
 	fi
 	# alternative : pdftk page_*.pdf cat output merged.pdf
 	rm -f -- "${page_pdfs[@]}"
 	echo "$i completed"
 done
	#!/bin/bash

	# pdfocr.sh - OCR the given files page by page and write one PDF per input.
	#
	# Usage: pdfocr.sh file1 [file2 ...]
	#
	# For every input file the script renders each page to a 600 dpi monochrome
	# TIFF with ImageMagick, lets tesseract turn that TIFF into a one-page PDF and
	# finally joins the page PDFs with Ghostscript into
	# "<input without its last extension>-ocr.pdf" next to the input file.
	# The OCR language is asked for interactively (default: eng).
	#
	# Exit status: 1 as soon as a file cannot be processed.
	#
	# builds on:
	# https://ubuntuforums.org/showthread.php?t=1370827

	# Prerequisites:
	# On Mac OS:
	# brew install --with-libtiff --with-ghostscript imagemagick
	# brew install --all-languages tesseract
	# brew install poppler

	if [[ $# -eq 0 ]]
	then
	echo "Usage: $0 files_to_OCR
	Examples:
	$0 file1 file2 ...
	"
	exit
	fi

	# choose language
	# The answer is kept in ocr_lang, not LANG: LANG is the locale variable and
	# overwriting it would change the locale of the tools started below.
	read -r -p "Language of the PDFs [eng]: " ocr_lang
	ocr_lang=${ocr_lang:-eng}
	# Real pipelines/redirections are used here; an escaped "\|" would be passed
	# to the command as a literal argument instead of connecting two commands.
	# Compare against whole lines of the list so that e.g. "en" is not accepted
	# just because "eng" is installed. stderr is captured as well because some
	# tesseract releases reportedly print the list there - TODO confirm.
	available_langs=$(tesseract --list-langs 2>&1)
	if ! grep -Fqx -- "$ocr_lang" <<<"$available_langs"; then
		echo "Language with code '$ocr_lang' is not supported"
		exit 1
	fi

	# Private scratch directory: no clashes with other runs or stale files, and
	# the EXIT trap removes it on every exit path, including the error exits.
	workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdfocr.XXXXXX") || {
		echo "cannot create temporary directory" >&2
		exit 1
	}
	trap 'rm -rf -- "$workdir"' EXIT

	# "$@" keeps every file name intact, even with spaces or glob characters.
	for i in "$@"
	do
		# check file
		if [[ ! -f "$i" ]]; then
			echo "$i: file not found" >&2
			exit 1
		fi

		# check if graphics file can be converted
		if ! identify "$i" >/dev/null 2>&1; then
			echo "$i: not convertable" >&2
			exit 1
		fi

		echo "Processing '$i'" >&2

		# get number of pages and perform conversion one page at a time
		pages=$(pdfinfo "$i" 2>/dev/null | awk '/^Pages:/ {print $2}')
		# Inputs for which pdfinfo reports no page count (presumably plain image
		# files that identify accepted) are handled as a single page.
		[[ "$pages" =~ ^[0-9]+$ ]] || pages=1

		# Output name: strip only the last extension of the file name itself, so
		# "./scan.pdf" becomes "./scan-ocr.pdf" and dots in directories are kept.
		file_name=${i##*/}
		case "$i" in
			*/*) file_dir=${i%/*}/ ;;
			*) file_dir= ;;
		esac
		stem=${file_name%.*}
		# a name like ".pdf" has nothing before the dot; keep it whole
		[[ -n "$stem" ]] || stem=$file_name
		output="${file_dir}${stem}-ocr.pdf"

		page_pdfs=()
		for (( j = 1; j <= pages; j++ )); do
			echo "Page $j"
			printf -v page_base '%s/page%05d' "$workdir" "$j"
			# converting the graphic file (ImageMagick page indexes start at 0)
			echo ">>> converting to image"
			if ! convert -monochrome -density 600 "${i}[$(( j - 1 ))]" "$page_base.tiff"; then
				echo "$i: converting page $j failed" >&2
				exit 1
			fi
			# apply OCR on this file
			echo ">>> applying OCR"
			tesseract "$page_base.tiff" -l "$ocr_lang" "$page_base" pdf >/dev/null 2>&1 # this suppresses stdout & stderr, comment out for debugging
			# remove temporary files
			rm -f -- "$page_base.tiff"
			# A missing page must not be dropped silently from the result.
			if [[ ! -s "$page_base.pdf" ]]; then
				echo "$i: OCR of page $j failed" >&2
				exit 1
			fi
			page_pdfs+=("$page_base.pdf")
		done

		if (( ${#page_pdfs[@]} == 0 )); then
			echo "$i: no pages to join" >&2
			exit 1
		fi

		# join temporary files (zero-padded names were collected in page order)
		echo "Joining pages...."
		if ! gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dPDFSETTINGS=/ebook "-sOutputFile=$output" "${page_pdfs[@]}"; then
			echo "$i: joining pages failed" >&2
			exit 1
		fi
		# alternative : pdftk page_*.pdf cat output merged.pdf
		rm -f -- "${page_pdfs[@]}"
		echo "$i completed"
	done