lefth · November 3, 2021 15:04 · lefth · Nov 3, 2021
diff --git a/ocrpdf.sh b/ocrpdf.sh
 #!/bin/bash

 # NOTE: I recommend pdfsandwich instead of this script, partly because imagemagick (and pdftoppm) fail on large detailed images.
 # While that technique does not preserve the original graphics, it can come close.
 # To preserve color:
 # pdfsandwich -rgb input.pdf
 # To preserve grey tones:
 # pdfsandwich -gray input.pdf
 # To disable all preprocessing:
 # pdfsandwich -nopreproc input.pdf

 # Enable job control (monitor mode): pages are OCRed as background jobs and
 # wait_jobs polls `jobs -r` to limit how many of them run at once.
 set -m # turn on job control for parallel processes

 # Source:
 # https://gist.github.com/wcaleb/7337097
 # https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
 #
 # Changes:
 # - Don't delete files with wildcards. Always use a (random) prefix.
 # - Fix extensions of generated files.
 # - Don't use greyscale because it's not compatible with some versions of tesseract.
 # - Clean up all generated files afterwards.
 # - Keep the hocr2pdf command from another fork, but comment it out because it failed in my tests.
 # - Process pages in parallel instead of using multithreading in tesseract (which is less efficient).
 #   Override the job parallelism by setting THREAD_COUNT.

 # Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
 # Hacked together using tips from these websites:
 #      http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
 #      http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
 # Dependencies: pdftk, tesseract, imagemagick, enscript and ghostscript (ps2pdf), or hocr2pdf/pdfsandwich


 # OCR one single-page PDF and produce "new-<FILE>": the page with the
 # recognized text rendered behind it.
 # Arguments: $1 - page PDF in the current directory (the main script passes
 #                 files created by `pdftk burst`)
 # Outputs:   a progress line on stdout; a warning on stderr if OCR failed
 # Side effects: writes "new-$1", then deletes every file whose name starts
 #               with the page's basename (the input page and all intermediates)
 # Returns:   0 when the text layer was added, 1 when the page had to be
 #            passed through without one (or the page name was empty),
 #            cp's status if even the pass-through copy failed
 function process_page() {
 	local FILE=$1
 	echo "Processing $FILE"
 	# Declared separately so a basename failure is not masked by `local`
 	local PAGE
 	PAGE=$(basename "$FILE" .pdf) || return
 	# An empty prefix would turn the cleanup glob below into `rm *`
 	[[ -n $PAGE ]] || return 1
 	local STATUS=1

 	# Convert the PDF page into a TIFF file
 	local IMG=$PAGE.tif
 	convert -density 600 "$FILE" "$IMG"

 	# OCR the TIFF file and save text to ${PAGE}_output.txt
 	# (one thread per tesseract: parallelism comes from running pages concurrently)
 	OMP_THREAD_LIMIT=1 tesseract "$IMG" "${PAGE}_output"

 	# Turn text file output by tesseract into a PDF, then put it in background of original page.
 	# Only the exit statuses of ps2pdf and pdftk decide success; enscript's is ignored, as before.
 	enscript "${PAGE}_output.txt" -B -o - | ps2pdf - "${PAGE}_output.pdf" &&
 		pdftk "$FILE" background "${PAGE}_output.pdf" output "new-$FILE" &&
 		STATUS=0

 	#tesseract "$IMG" "${PAGE}_output" hocr
 	## Turn html output by tesseract into a PDF, combined with the original image as foreground
 	#hocr2pdf -i "$IMG" -o "new-${FILE}" < "${PAGE}_output.hocr"

 	# Never drop a page: if OCR failed, pass the original page through unchanged
 	# so the reassembled document keeps every page.
 	if [[ $STATUS -ne 0 ]]; then
 		echo "OCR failed for $FILE; keeping the page without a text layer" >&2
 		cp -- "$FILE" "new-$FILE" || return
 	fi

 	# Clean up
 	rm -- "$PAGE"*
 	return "$STATUS"
 }

 # Block until fewer background jobs are running than the allowed maximum.
 # Globals: THREAD_COUNT (read, optional) - maximum number of parallel jobs;
 #          defaults to the number of CPUs.
 # Returns: 0 once a job slot is free.
 function wait_jobs() {
 	local LIMIT=${THREAD_COUNT:-}
 	if [[ -z $LIMIT ]]; then
 		# nproc is not available everywhere; fall back to getconf, then to 1
 		LIMIT=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
 	fi
 	# A limit of 0 or a non-numeric value could never be satisfied and would
 	# make the loop below spin forever, so fall back to one job at a time.
 	if ! [[ $LIMIT =~ ^[1-9][0-9]*$ ]]; then
 		LIMIT=1
 	fi

 	while (( $(jobs -r | wc -l) >= LIMIT )); do
 		sleep 0.25
 	done
 }

 # Main: validate the argument, back up the input, burst it into single pages,
 # OCR the pages as throttled background jobs, then reassemble them over the input.
 if [[ $# -eq 0 || ! -e $1 ]]
 then
 	echo "Adds an OCR text layer to a PDF file to make searching easier." >&2
 	echo "Usage: $0 <pdf file>" >&2
 	exit 1
 fi

 # All scratch files share this random prefix so the cleanup globs cannot
 # touch unrelated files.
 TEMPNAME=$(mktemp -p . -u)
 TEMPNAME=${TEMPNAME/.\//} # remove "./"
 # An empty prefix would make the globs below match unrelated files.
 if [[ -z $TEMPNAME || -e $TEMPNAME ]]
 then
 	echo "Could not create temp filenames" >&2
 	exit 1
 fi

 # Keep a backup: the input file is overwritten with the OCRed result below.
 cp -- "$1" "$1.bak" || exit 1
 if ! pdftk "$1" burst output "${TEMPNAME}_tesspage_%05d.pdf"
 then
 	echo "Could not split $1 into pages" >&2
 	rm -f -- doc_data.txt "${TEMPNAME}_tesspage_"*
 	exit 1
 fi

 for FILE in "${TEMPNAME}"_tesspage*
 do
 	process_page "$FILE" &
 	wait_jobs
 done
 wait

 # Page numbers are zero-padded, so the glob expands in page order.
 if ! pdftk "new-${TEMPNAME}"* cat output "$1"
 then
 	echo "Could not assemble the OCRed pages; the original is kept as $1.bak" >&2
 fi

 # Clean up
 rm -- doc_data.txt "new-${TEMPNAME}"*
	#!/bin/bash

	# NOTE: I recommend pdfsandwich instead of this script, partly because imagemagick (and pdftoppm) fail on large detailed images.
	# While that technique does not preserve the original graphics, it can come close.
	# To preserve color:
	# pdfsandwich -rgb input.pdf
	# To preserve grey tones:
	# pdfsandwich -gray input.pdf
	# To disable all preprocessing:
	# pdfsandwich -nopreproc input.pdf

	# Enable job control (monitor mode): pages are OCRed as background jobs and
	# wait_jobs polls `jobs -r` to limit how many of them run at once.
	set -m # turn on job control for parallel processes

	# Source:
	# https://gist.github.com/wcaleb/7337097
	# https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
	#
	# Changes:
	# - Don't delete files with wildcards. Always use a (random) prefix.
	# - Fix extensions of generated files.
	# - Don't use greyscale because it's not compatible with some versions of tesseract.
	# - Clean up all generated files afterwards.
	# - Keep the hocr2pdf command from another fork, but comment it out because it failed in my tests.
	# - Process pages in parallel instead of using multithreading in tesseract (which is less efficient).
	# Override the job parallelism by setting THREAD_COUNT.

	# Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
	# Hacked together using tips from these websites:
	# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
	# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
	# Dependencies: pdftk, tesseract, imagemagick, enscript and ghostscript (ps2pdf), or hocr2pdf/pdfsandwich


	# OCR one single-page PDF and produce "new-<FILE>": the page with the
	# recognized text rendered behind it.
	# Arguments: $1 - page PDF in the current directory (the main script passes
	#                 files created by `pdftk burst`)
	# Outputs:   a progress line on stdout; a warning on stderr if OCR failed
	# Side effects: writes "new-$1", then deletes every file whose name starts
	#               with the page's basename (the input page and all intermediates)
	# Returns:   0 when the text layer was added, 1 when the page had to be
	#            passed through without one (or the page name was empty),
	#            cp's status if even the pass-through copy failed
	function process_page() {
		local FILE=$1
		echo "Processing $FILE"
		# Declared separately so a basename failure is not masked by `local`
		local PAGE
		PAGE=$(basename "$FILE" .pdf) || return
		# An empty prefix would turn the cleanup glob below into `rm *`
		[[ -n $PAGE ]] || return 1
		local STATUS=1

		# Convert the PDF page into a TIFF file
		local IMG=$PAGE.tif
		convert -density 600 "$FILE" "$IMG"

		# OCR the TIFF file and save text to ${PAGE}_output.txt
		# (one thread per tesseract: parallelism comes from running pages concurrently)
		OMP_THREAD_LIMIT=1 tesseract "$IMG" "${PAGE}_output"

		# Turn text file output by tesseract into a PDF, then put it in background of original page.
		# This must be a real pipe: an escaped "\|" would hand a literal "|" to enscript.
		# Only the exit statuses of ps2pdf and pdftk decide success; enscript's is ignored.
		enscript "${PAGE}_output.txt" -B -o - | ps2pdf - "${PAGE}_output.pdf" &&
			pdftk "$FILE" background "${PAGE}_output.pdf" output "new-$FILE" &&
			STATUS=0

		#tesseract "$IMG" "${PAGE}_output" hocr
		## Turn html output by tesseract into a PDF, combined with the original image as foreground
		#hocr2pdf -i "$IMG" -o "new-${FILE}" < "${PAGE}_output.hocr"

		# Never drop a page: if OCR failed, pass the original page through unchanged
		# so the reassembled document keeps every page.
		if [[ $STATUS -ne 0 ]]; then
			echo "OCR failed for $FILE; keeping the page without a text layer" >&2
			cp -- "$FILE" "new-$FILE" || return
		fi

		# Clean up
		rm -- "$PAGE"*
		return "$STATUS"
	}

	# Block until fewer background jobs are running than the allowed maximum.
	# Globals: THREAD_COUNT (read, optional) - maximum number of parallel jobs;
	#          defaults to the number of CPUs.
	# Returns: 0 once a job slot is free.
	function wait_jobs() {
		local LIMIT=${THREAD_COUNT:-}
		if [[ -z $LIMIT ]]; then
			# nproc is not available everywhere; fall back to getconf, then to 1
			LIMIT=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
		fi
		# A limit of 0 or a non-numeric value could never be satisfied and would
		# make the loop below spin forever, so fall back to one job at a time.
		if ! [[ $LIMIT =~ ^[1-9][0-9]*$ ]]; then
			LIMIT=1
		fi

		# Count running jobs through a real pipe into wc
		while (( $(jobs -r | wc -l) >= LIMIT )); do
			sleep 0.25
		done
	}

	# Main: validate the argument, back up the input, burst it into single pages,
	# OCR the pages as throttled background jobs, then reassemble them over the input.
	if [[ $# -eq 0 || ! -e $1 ]]
	then
		echo "Adds an OCR text layer to a PDF file to make searching easier." >&2
		echo "Usage: $0 <pdf file>" >&2
		exit 1
	fi

	# All scratch files share this random prefix so the cleanup globs cannot
	# touch unrelated files.
	TEMPNAME=$(mktemp -p . -u)
	TEMPNAME=${TEMPNAME/.\//} # remove "./"
	# An empty prefix would make the globs below match unrelated files.
	if [[ -z $TEMPNAME || -e $TEMPNAME ]]
	then
		echo "Could not create temp filenames" >&2
		exit 1
	fi

	# Keep a backup: the input file is overwritten with the OCRed result below.
	cp -- "$1" "$1.bak" || exit 1
	if ! pdftk "$1" burst output "${TEMPNAME}_tesspage_%05d.pdf"
	then
		echo "Could not split $1 into pages" >&2
		rm -f -- doc_data.txt "${TEMPNAME}_tesspage_"*
		exit 1
	fi

	for FILE in "${TEMPNAME}"_tesspage*
	do
		process_page "$FILE" &
		wait_jobs
	done
	wait

	# Page numbers are zero-padded, so the glob expands in page order.
	if ! pdftk "new-${TEMPNAME}"* cat output "$1"
	then
		echo "Could not assemble the OCRed pages; the original is kept as $1.bak" >&2
	fi

	# Clean up
	rm -- doc_data.txt "new-${TEMPNAME}"*