dllud · February 9, 2014 01:10
diff --git a/pdfocr b/pdfocr
 #!/bin/bash

 # This is a script to transform a PDF containing a scanned book into a searchable PDF.
 # Based on previous script and many good tips by Konrad Voelkel:
 # http://blog.konradvoelkel.de/2010/01/linux-ocr-and-pdf-problem-solved/
 # http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
 # Depends on convert (ImageMagick), pdftk and hocr2pdf (ExactImage).
 # $ sudo apt-get install imagemagick pdftk exactimage
 # You also need at least one OCR software which can be either tesseract or cuneiform.
 # $ sudo apt-get install tesseract-ocr
 # $ sudo apt-get install cuneiform
 # To install languages into tesseract do (e.g. for Portuguese):
 # $ sudo apt-get install tesseract-ocr-por
 #
 # usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title
 # where ocr-sfw is either tesseract or cuneiform
 # split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
 # lang is a language as in "tesseract --list-langs" or "cuneiform -l".
 # and author, title are used for the PDF metadata (left empty when omitted).
 #
 # usage example:
 # ./pdfocr.sh SomeFile.pdf tesseract 1 por "Some Author" "Some Title"
 #
 # The result is written next to the input file as <name>-ocr.pdf.
 # All intermediate files live in a private pdfocr.XXXXXX directory created
 # under the current directory and removed on exit, so unrelated files named
 # pg_* or merged* are never touched.
 # Exit status: 0 on success, 2 on a usage error, non-zero when any step fails.

 # Abort on the first failing step: a page that fails to convert or OCR would
 # otherwise silently go missing from the merged PDF.
 set -euo pipefail

 # Print the command-line synopsis to stderr.
 usage() {
 	echo "usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title" >&2
 }

 # Print an error message to stderr and abort with status 1.
 die() {
 	printf '%s\n' "$*" >&2
 	exit 1
 }

 # document, ocr-sfw, split and lang are mandatory; author and title are optional.
 if [[ $# -lt 4 ]]; then
 	usage
 	exit 2
 fi

 input=$1
 ocr_sfw=$2
 split=$3
 lang=$4
 author=${5:-}
 title=${6:-}

 # Reject an unknown OCR engine once, up front, instead of once per page.
 case "$ocr_sfw" in
 	tesseract | cuneiform) ;;
 	*) die "$ocr_sfw is not a valid OCR software." ;;
 esac

 [[ -f "$input" ]] || die "$input: no such file"

 for tool in pdftk convert hocr2pdf "$ocr_sfw"; do
 	command -v "$tool" > /dev/null || die "required tool not found: $tool"
 done

 # Resolve absolute paths before changing directory. The extension is stripped
 # from the file name only, so a dot in a directory name cannot truncate the path.
 input_dir=$(CDPATH='' cd -- "$(dirname -- "$input")" && pwd) \
 	|| die "cannot resolve directory of $input"
 input_name=$(basename -- "$input")
 input_abs="$input_dir/$input_name"
 output="$input_dir/${input_name%.*}-ocr.pdf"

 # Private working directory; everything inside is disposable.
 workdir=$(mktemp -d "$PWD/pdfocr.XXXXXX") || die "cannot create working directory"
 cleanup() {
 	rm -rf -- "$workdir"
 }
 trap cleanup EXIT
 cd -- "$workdir"

 # Split the document into pg_NNNN.pdf pages; burst also writes doc_data.txt,
 # which is fed back into the merged PDF further down.
 pdftk "$input_abs" burst dont_ask

 # Rasterize every page at 300 dpi; with split=1 each page is cut into a left
 # and a right half, one output image per half.
 convert_args=(-normalize -density 300 -depth 8)
 if [[ "$split" == "1" ]]; then
 	convert_args+=(-crop 50%x100% +repage)
 fi

 for f in pg_*.pdf; do
 	[[ -e "$f" ]] || die "pdftk produced no pages from $input"
 	convert "${convert_args[@]}" "$f" "$f.png"
 done
 # Remove the burst pages so that pg_*.pdf below matches only the OCRed pages.
 rm -- pg_*.pdf

 for f in pg_*.png; do
 	[[ -e "$f" ]] || die "convert produced no page images"
 	if [[ "$ocr_sfw" == "tesseract" ]]; then
 		# NOTE(review): newer tesseract releases spell this option --psm;
 		# confirm against the installed version.
 		tesseract -l "$lang" -psm 1 "$f" "$f" hocr
 		# Tesseract releases after 3.02 name the hOCR output <base>.hocr
 		# rather than <base>.html; accept either.
 		if [[ ! -e "$f.html" && -e "$f.hocr" ]]; then
 			mv -- "$f.hocr" "$f.html"
 		fi
 	else
 		cuneiform -l "$lang" -f hocr -o "$f.html" "$f"
 	fi
 	# Combine the page image with its hOCR text layer into a one-page PDF.
 	hocr2pdf -i "$f" -r 300 -s -o "$f.pdf" < "$f.html"
 done

 pdftk pg_*.pdf cat output merged.pdf

 # First re-apply the data pdftk dumped during burst, then overlay the
 # author/title/creator fields supplied on the command line.
 pdftk merged.pdf update_info_utf8 doc_data.txt output merged+data.pdf
 printf 'InfoBegin\nInfoKey: %s\nInfoValue: %s\n' \
 	Author "$author" \
 	Title "$title" \
 	Creator "PDF OCR scan script" > in.info
 pdftk merged+data.pdf update_info_utf8 in.info output "$output"
	#!/bin/bash

	# This is a script to transform a PDF containing a scanned book into a searchable PDF.
	# Based on previous script and many good tips by Konrad Voelkel:
	# http://blog.konradvoelkel.de/2010/01/linux-ocr-and-pdf-problem-solved/
	# http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
	# Depends on convert (ImageMagick), pdftk and hocr2pdf (ExactImage).
	# $ sudo apt-get install imagemagick pdftk exactimage
	# You also need at least one OCR software which can be either tesseract or cuneiform.
	# $ sudo apt-get install tesseract-ocr
	# $ sudo apt-get install cuneiform
	# To install languages into tesseract do (e.g. for Portuguese):
	# $ sudo apt-get install tesseract-ocr-por
	#
	# usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title
	# where ocr-sfw is either tesseract or cuneiform
	# split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
	# lang is a language as in "tesseract --list-langs" or "cuneiform -l".
	# and author, title are used for the PDF metadata (left empty when omitted).
	#
	# usage example:
	# ./pdfocr.sh SomeFile.pdf tesseract 1 por "Some Author" "Some Title"
	#
	# The result is written next to the input file as <name>-ocr.pdf.
	# All intermediate files live in a private pdfocr.XXXXXX directory created
	# under the current directory and removed on exit, so unrelated files named
	# pg_* or merged* are never touched.
	# Exit status: 0 on success, 2 on a usage error, non-zero when any step fails.

	# Abort on the first failing step: a page that fails to convert or OCR would
	# otherwise silently go missing from the merged PDF.
	set -euo pipefail

	# Print the command-line synopsis to stderr.
	usage() {
		echo "usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title" >&2
	}

	# Print an error message to stderr and abort with status 1.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	# document, ocr-sfw, split and lang are mandatory; author and title are optional.
	if [[ $# -lt 4 ]]; then
		usage
		exit 2
	fi

	input=$1
	ocr_sfw=$2
	split=$3
	lang=$4
	author=${5:-}
	title=${6:-}

	# Reject an unknown OCR engine once, up front, instead of once per page.
	case "$ocr_sfw" in
		tesseract | cuneiform) ;;
		*) die "$ocr_sfw is not a valid OCR software." ;;
	esac

	[[ -f "$input" ]] || die "$input: no such file"

	for tool in pdftk convert hocr2pdf "$ocr_sfw"; do
		command -v "$tool" > /dev/null || die "required tool not found: $tool"
	done

	# Resolve absolute paths before changing directory. The extension is stripped
	# from the file name only, so a dot in a directory name cannot truncate the path.
	input_dir=$(CDPATH='' cd -- "$(dirname -- "$input")" && pwd) \
		|| die "cannot resolve directory of $input"
	input_name=$(basename -- "$input")
	input_abs="$input_dir/$input_name"
	output="$input_dir/${input_name%.*}-ocr.pdf"

	# Private working directory; everything inside is disposable.
	workdir=$(mktemp -d "$PWD/pdfocr.XXXXXX") || die "cannot create working directory"
	cleanup() {
		rm -rf -- "$workdir"
	}
	trap cleanup EXIT
	cd -- "$workdir"

	# Split the document into pg_NNNN.pdf pages; burst also writes doc_data.txt,
	# which is fed back into the merged PDF further down.
	pdftk "$input_abs" burst dont_ask

	# Rasterize every page at 300 dpi; with split=1 each page is cut into a left
	# and a right half, one output image per half.
	convert_args=(-normalize -density 300 -depth 8)
	if [[ "$split" == "1" ]]; then
		convert_args+=(-crop 50%x100% +repage)
	fi

	for f in pg_*.pdf; do
		[[ -e "$f" ]] || die "pdftk produced no pages from $input"
		convert "${convert_args[@]}" "$f" "$f.png"
	done
	# Remove the burst pages so that pg_*.pdf below matches only the OCRed pages.
	rm -- pg_*.pdf

	for f in pg_*.png; do
		[[ -e "$f" ]] || die "convert produced no page images"
		if [[ "$ocr_sfw" == "tesseract" ]]; then
			# NOTE(review): newer tesseract releases spell this option --psm;
			# confirm against the installed version.
			tesseract -l "$lang" -psm 1 "$f" "$f" hocr
			# Tesseract releases after 3.02 name the hOCR output <base>.hocr
			# rather than <base>.html; accept either.
			if [[ ! -e "$f.html" && -e "$f.hocr" ]]; then
				mv -- "$f.hocr" "$f.html"
			fi
		else
			cuneiform -l "$lang" -f hocr -o "$f.html" "$f"
		fi
		# Combine the page image with its hOCR text layer into a one-page PDF.
		hocr2pdf -i "$f" -r 300 -s -o "$f.pdf" < "$f.html"
	done

	pdftk pg_*.pdf cat output merged.pdf

	# First re-apply the data pdftk dumped during burst, then overlay the
	# author/title/creator fields supplied on the command line.
	pdftk merged.pdf update_info_utf8 doc_data.txt output merged+data.pdf
	printf 'InfoBegin\nInfoKey: %s\nInfoValue: %s\n' \
		Author "$author" \
		Title "$title" \
		Creator "PDF OCR scan script" > in.info
	pdftk merged+data.pdf update_info_utf8 in.info output "$output"