zoidyzoidzoid · October 3, 2020 10:27
diff --git a/pdf-ocr.sh b/pdf-ocr.sh
 #!/usr/bin/env bash
 # pdf-ocr.sh - OCR a scanned PDF and write a smaller, OCR'd copy of it.
 #
 # usage: ./pdf-ocr.sh scanned.pdf
 #
 # Download this file
 # Make it executable:
 #   chmod +x pdf-ocr.sh
 # Run it on your example file
 #   ./pdf-ocr.sh scanned.pdf
 #
 # Files written to the current directory (NAME = input name without .pdf):
 #   NAME-OCRd-big.pdf  PDF produced by tesseract from the rasterised pages
 #   NAME-OCRd.pdf      the same PDF re-written by Ghostscript (/ebook preset)
 # A temporary NAME.tiff is created and removed again.
 set -euf -o pipefail -o xtrace

 # Fail with a usage message instead of bash's bare "unbound variable" error.
 INFILE="${1:?usage: ./pdf-ocr.sh scanned.pdf}"
 if [[ ! -r "$INFILE" ]]; then
   printf 'pdf-ocr.sh: cannot read input file: %s\n' "$INFILE" >&2
   exit 1
 fi

 # Check for both tools up front so a missing tesseract is not discovered
 # only after the expensive rasterisation step.
 for TOOL in gs tesseract; do
   if ! command -v "$TOOL" >/dev/null; then
     printf 'pdf-ocr.sh: required tool not found: %s\n' "$TOOL" >&2
     exit 1
   fi
 done

 # `--` keeps an input name that starts with `-` from being read as an option.
 BASENAME=$(basename -- "$INFILE" .pdf)
 TIFFFILE=$BASENAME.tiff
 OCRDPDFNOEXT=$BASENAME-OCRd-big
 OCRDPDF=$OCRDPDFNOEXT.pdf
 SMALLEROCRDPDF=$BASENAME-OCRd.pdf

 # The intermediate TIFF is large; remove it on every exit path, including
 # failures of gs or tesseract under `set -e`.
 trap 'rm -f -- "$TIFFFILE"' EXIT

 # Make a multipage TIFF of the original PDF ~700MB
 gs -o "$TIFFFILE" -sDEVICE=tiff32nc -r300 "$INFILE"
 # OCR the TIFF using tesseract4; tesseract appends ".pdf" to the output base
 tesseract "$PWD/$TIFFFILE" "$PWD/$OCRDPDFNOEXT" pdf
 # Free the disk space before the final pass (the EXIT trap is the safety net)
 rm -f -- "$TIFFFILE"
 # Convert images in PDF to jpeg to reduce size ~4MB
 gs -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -sOutputFile="$SMALLEROCRDPDF" "$OCRDPDF"
	#!/usr/bin/env bash
	# pdf-ocr.sh - OCR a scanned PDF and write a smaller, OCR'd copy of it.
	#
	# usage: ./pdf-ocr.sh scanned.pdf
	#
	# Download this file
	# Make it executable:
	#   chmod +x pdf-ocr.sh
	# Run it on your example file
	#   ./pdf-ocr.sh scanned.pdf
	#
	# Files written to the current directory (NAME = input name without .pdf):
	#   NAME-OCRd-big.pdf  PDF produced by tesseract from the rasterised pages
	#   NAME-OCRd.pdf      the same PDF re-written by Ghostscript (/ebook preset)
	# A temporary NAME.tiff is created and removed again.
	set -euf -o pipefail -o xtrace

	# Fail with a usage message instead of bash's bare "unbound variable" error.
	INFILE="${1:?usage: ./pdf-ocr.sh scanned.pdf}"
	if [[ ! -r "$INFILE" ]]; then
		printf 'pdf-ocr.sh: cannot read input file: %s\n' "$INFILE" >&2
		exit 1
	fi

	# Check for both tools up front so a missing tesseract is not discovered
	# only after the expensive rasterisation step.
	for TOOL in gs tesseract; do
		if ! command -v "$TOOL" >/dev/null; then
			printf 'pdf-ocr.sh: required tool not found: %s\n' "$TOOL" >&2
			exit 1
		fi
	done

	# `--` keeps an input name that starts with `-` from being read as an option.
	BASENAME=$(basename -- "$INFILE" .pdf)
	TIFFFILE=$BASENAME.tiff
	OCRDPDFNOEXT=$BASENAME-OCRd-big
	OCRDPDF=$OCRDPDFNOEXT.pdf
	SMALLEROCRDPDF=$BASENAME-OCRd.pdf

	# The intermediate TIFF is large; remove it on every exit path, including
	# failures of gs or tesseract under `set -e`.
	trap 'rm -f -- "$TIFFFILE"' EXIT

	# Make a multipage TIFF of the original PDF ~700MB
	gs -o "$TIFFFILE" -sDEVICE=tiff32nc -r300 "$INFILE"
	# OCR the TIFF using tesseract4; tesseract appends ".pdf" to the output base
	tesseract "$PWD/$TIFFFILE" "$PWD/$OCRDPDFNOEXT" pdf
	# Free the disk space before the final pass (the EXIT trap is the safety net)
	rm -f -- "$TIFFFILE"
	# Convert images in PDF to jpeg to reduce size ~4MB
	gs -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -sOutputFile="$SMALLEROCRDPDF" "$OCRDPDF"