antipatico · January 23, 2021 15:37
diff --git a/ocrpdf.sh b/ocrpdf.sh
 #!/bin/bash

 # Take a PDF, convert each page to an image, OCR it with tesseract, and merge the OCR-ed pages into a new searchable PDF.
 #
 # Hacked together using tips from these websites:
 #      http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
 #      http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
 #
 #
 # Derived from: https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
 # Original: https://gist.github.com/wcaleb/7337097
 #
 # Author: antipatico (https://github.com/antipatico)
 # Date: 23 Jan 2021
 # Version: 0.2
 # Dependencies: pdftk tesseract-ocr poppler-utils ghostscript parallel

 # Print a message on stderr and abort the script with status 1.
 die() {
 	printf '%s\n' "$*" >&2
 	exit 1
 }

 # Both the input and the output path are mandatory.
 if [ $# -lt 2 ]; then
 	die "USAGE: $0 INPUT.pdf OUTPUT.pdf"
 fi

 # Verify every external tool before any work is done.
 # Each entry is "executable:name reported to the user".
 # `command -v` is a shell builtin, so the check itself does not depend on
 # the external `which` utility being installed.
 for dep in \
 	"pdftoppm:poppler-utils" \
 	"tesseract:tesseract-ocr" \
 	"pdftk:pdftk" \
 	"gs:ghostscript" \
 	"parallel:GNU parallel"
 do
 	if ! command -v "${dep%%:*}" >/dev/null 2>&1; then
 		# Diagnostics go to stderr (the old `2>&1` left them on stdout).
 		die "ERROR: ${dep#*:} is not installed."
 	fi
 done
 # Abort on the first failing command so a broken step never feeds the next.
 set -e

 # All intermediate files live in a private scratch directory.
 workdir=$(mktemp -d)

 # Remove the scratch directory whenever the script exits. Without the trap,
 # a step failing under `set -e` left the directory behind.
 cleanup() {
 	rm -rf -- "${workdir:?}"
 }
 trap cleanup EXIT

 # `--` keeps a path that begins with a dash from being parsed as an option.
 cp -- "$1" "$workdir/input.pdf"
 pushd "$workdir" >/dev/null
 echo "ocrpdf.sh version 0.2"
 echo "Script created by antipatico (https://github.com/antipatico)"
 echo ""
 echo "Current work directory: \"$workdir\""

 echo "Step 1/5: splitting the pdf into pages"
 # The zero-padded page number keeps the later globs in page order.
 pdftk input.pdf burst output tesspage_%05d.pdf

 echo "Step 2/5: converting each page into jpg"
 parallel --bar 'pdftoppm -jpeg "{}" "{.}"' ::: tesspage_*.pdf
 rm -f tesspage_*.pdf

 echo "Step 3/5: ocr each image and convert back into pdf"
 # tesseract output is silenced so it does not garble the progress bar.
 parallel --bar 'tesseract "{}" "ocrpdf_{.}" pdf >/dev/null 2>&1' ::: tesspage_*.jpg

 echo "Step 4/5: merge ocr-ed pages back into a single pdf"
 rm -f tesspage_*.jpg
 pdftk ocrpdf_*.pdf cat output output.pdf

 echo "Step 5/5: optimizing PDF for ebooks"
 rm -f ocrpdf_*.pdf
 gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output-optimized.pdf output.pdf

 # Return to the caller's directory first so a relative OUTPUT path resolves
 # against it rather than against the scratch directory.
 popd >/dev/null
 mv -- "$workdir/output-optimized.pdf" "$2"
	#!/bin/bash

	# Take a PDF, convert each page to an image, OCR it with tesseract, and merge the OCR-ed pages into a new searchable PDF.
	#
	# Hacked together using tips from these websites:
	# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
	# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
	#
	#
	# Derived from: https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
	# Original: https://gist.github.com/wcaleb/7337097
	#
	# Author: antipatico (https://github.com/antipatico)
	# Date: 23 Jan 2021
	# Version: 0.2
	# Dependencies: pdftk tesseract-ocr poppler-utils ghostscript parallel

	# Print a message on stderr and abort the script with status 1.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	# Both the input and the output path are mandatory.
	if [ $# -lt 2 ]; then
		die "USAGE: $0 INPUT.pdf OUTPUT.pdf"
	fi

	# Verify every external tool before any work is done.
	# Each entry is "executable:name reported to the user".
	# `command -v` is a shell builtin, so the check itself does not depend on
	# the external `which` utility being installed.
	for dep in \
		"pdftoppm:poppler-utils" \
		"tesseract:tesseract-ocr" \
		"pdftk:pdftk" \
		"gs:ghostscript" \
		"parallel:GNU parallel"
	do
		if ! command -v "${dep%%:*}" >/dev/null 2>&1; then
			# Diagnostics go to stderr (the old `2>&1` left them on stdout).
			die "ERROR: ${dep#*:} is not installed."
		fi
	done
	# Abort on the first failing command so a broken step never feeds the next.
	set -e

	# All intermediate files live in a private scratch directory.
	workdir=$(mktemp -d)

	# Remove the scratch directory whenever the script exits. Without the trap,
	# a step failing under `set -e` left the directory behind.
	cleanup() {
		rm -rf -- "${workdir:?}"
	}
	trap cleanup EXIT

	# `--` keeps a path that begins with a dash from being parsed as an option.
	cp -- "$1" "$workdir/input.pdf"
	pushd "$workdir" >/dev/null
	echo "ocrpdf.sh version 0.2"
	echo "Script created by antipatico (https://github.com/antipatico)"
	echo ""
	echo "Current work directory: \"$workdir\""

	echo "Step 1/5: splitting the pdf into pages"
	# The zero-padded page number keeps the later globs in page order.
	pdftk input.pdf burst output tesspage_%05d.pdf

	echo "Step 2/5: converting each page into jpg"
	parallel --bar 'pdftoppm -jpeg "{}" "{.}"' ::: tesspage_*.pdf
	rm -f tesspage_*.pdf

	echo "Step 3/5: ocr each image and convert back into pdf"
	# tesseract output is silenced so it does not garble the progress bar.
	parallel --bar 'tesseract "{}" "ocrpdf_{.}" pdf >/dev/null 2>&1' ::: tesspage_*.jpg

	echo "Step 4/5: merge ocr-ed pages back into a single pdf"
	rm -f tesspage_*.jpg
	pdftk ocrpdf_*.pdf cat output output.pdf

	echo "Step 5/5: optimizing PDF for ebooks"
	rm -f ocrpdf_*.pdf
	gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output-optimized.pdf output.pdf

	# Return to the caller's directory first so a relative OUTPUT path resolves
	# against it rather than against the scratch directory.
	popd >/dev/null
	mv -- "$workdir/output-optimized.pdf" "$2"