scruss · April 11, 2017 20:37
diff --git a/dwim-ocr.sh b/dwim-ocr.sh
 #!/bin/bash
 # dwim-ocr.sh - ocr a pdf document to current directory as mono bitmap
 # created by scruss on Fri 11 May 2012 19:56:03 EDT
 # $Id: dwim-ocr.sh,v 1.5 2017/03/28 20:58:39 scruss Exp $
 #
 # usage: dwim-ocr.sh input.pdf [output.pdf]
 #   input.pdf   PDF to rasterise and OCR
 #   output.pdf  destination; defaults to the input name with its last
 #               extension replaced by "-ocr.pdf"
 # Exits non-zero if any step fails or the output ends up missing/empty.
 # Calls gs, GNU parallel, tesseract and pdfbeads.

 set -eu

 # fail with a readable message instead of a bare "unbound variable" error
 if [ "$#" -lt 1 ]; then
    echo "usage: ${0##*/} input.pdf [output.pdf]" >&2
    exit 1
 fi

 input="$1"
 # uses $2 if specified, sensible default otherwise
 output=${2:-${1%\.*}-ocr.pdf}

 # pdfbeads is run from inside the temporary directory, so any relative
 # output path (bare filename or with directories) must be anchored to the
 # caller's directory first; absolute paths are left untouched
 case "$output" in
    /*) ;;
    *) output="${PWD}/${output}" ;;
 esac
 echo "Writing to ${output} ..."

 tmpdir="$(mktemp -d /tmp/dwim-ocr.XXXXXXX)" || exit 1
 # remove the scratch directory on every exit path, including the early
 # exits that set -e triggers when one of the tools below fails
 trap 'rm -rf -- "$tmpdir"' EXIT

 # extract images of the pages (note: resolution hard-coded)
 # FIXME at some point; use pdftoppm?
 gs -SDEVICE=tiffg4 -r360x360 -sOutputFile="$tmpdir/page-%04d.tif" -dNOPAUSE -dBATCH -- "$input"

 # OCR each page in parallel and convert into PDF
 pushd "$tmpdir"
 # feed the page names one per line on stdin; printf is a builtin, so the
 # glob is expanded without running (or parsing the output of) ls
 printf '%s\n' page-*.tif | parallel --no-notice --gnu tesseract {} {.} hocr

 # combine the pages into one PDF
 pdfbeads -o "$output" *

 popd

 # check if job failed: a missing or zero-length output counts as failure
 if [ ! -s "$output" ]; then
    rm -f -- "$output"
    echo "$output failed." >&2
    exit 1
 fi
	#!/bin/bash
	# dwim-ocr.sh - ocr a pdf document to current directory as mono bitmap
	# created by scruss on Fri 11 May 2012 19:56:03 EDT
	# $Id: dwim-ocr.sh,v 1.5 2017/03/28 20:58:39 scruss Exp $
	#
	# usage: dwim-ocr.sh input.pdf [output.pdf]
	#   input.pdf   PDF to rasterise and OCR
	#   output.pdf  destination; defaults to the input name with its last
	#               extension replaced by "-ocr.pdf"
	# Exits non-zero if any step fails or the output ends up missing/empty.
	# Calls gs, GNU parallel, tesseract and pdfbeads.

	set -eu

	# fail with a readable message instead of a bare "unbound variable" error
	if [ "$#" -lt 1 ]; then
	  echo "usage: ${0##*/} input.pdf [output.pdf]" >&2
	  exit 1
	fi

	input="$1"
	# uses $2 if specified, sensible default otherwise
	output=${2:-${1%\.*}-ocr.pdf}

	# pdfbeads is run from inside the temporary directory, so any relative
	# output path (bare filename or with directories) must be anchored to the
	# caller's directory first; absolute paths are left untouched
	case "$output" in
	  /*) ;;
	  *) output="${PWD}/${output}" ;;
	esac
	echo "Writing to ${output} ..."

	# the fallback must be a real || operator; a backslash-escaped pipe is
	# just a literal word to the shell
	tmpdir="$(mktemp -d /tmp/dwim-ocr.XXXXXXX)" || exit 1
	# remove the scratch directory on every exit path, including the early
	# exits that set -e triggers when one of the tools below fails
	trap 'rm -rf -- "$tmpdir"' EXIT

	# extract images of the pages (note: resolution hard-coded)
	# FIXME at some point; use pdftoppm?
	gs -SDEVICE=tiffg4 -r360x360 -sOutputFile="$tmpdir/page-%04d.tif" -dNOPAUSE -dBATCH -- "$input"

	# OCR each page in parallel and convert into PDF
	pushd "$tmpdir"
	# feed the page names one per line on stdin through a real pipe; printf
	# is a builtin, so the glob is expanded without running ls
	printf '%s\n' page-*.tif | parallel --no-notice --gnu tesseract {} {.} hocr

	# combine the pages into one PDF
	pdfbeads -o "$output" *

	popd

	# check if job failed: a missing or zero-length output counts as failure
	if [ ! -s "$output" ]; then
	  rm -f -- "$output"
	  echo "$output failed." >&2
	  exit 1
	fi