Created
April 11, 2017 20:37
-
-
Save scruss/1878f738b5f72cb8f6a13c9cb46b076f to your computer and use it in GitHub Desktop.
dwim-ocr.sh - OCR a PDF document to the current directory as a mono bitmap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# dwim-ocr.sh - ocr a pdf document to current directory as mono bitmap
# created by scruss on Fri 11 May 2012 19:56:03 EDT
# $Id: dwim-ocr.sh,v 1.5 2017/03/28 20:58:39 scruss Exp $
#
# Usage: dwim-ocr.sh input.pdf [output.pdf]
#
#   input.pdf   PDF document to OCR.
#   output.pdf  Where to write the result. Defaults to the input name with
#               its extension replaced by "-ocr.pdf".
#
# Environment:
#   DWIM_OCR_RESOLUTION  Rendering resolution in dpi for the page images
#                        (default: 360).
#   TMPDIR               Parent of the scratch directory (default: /tmp).
#
# Pipeline: gs renders each page to a G4 TIFF, tesseract produces hOCR for
# every page (run through GNU parallel), pdfbeads combines the lot into
# one PDF.
#
# Exit status: 0 on success, 1 on failure, 2 on bad usage.

set -euo pipefail

# Scratch directory; global so the EXIT trap can see it.
tmpdir=''

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr and abort with status 2.
usage() {
  printf 'Usage: %s input.pdf [output.pdf]\n' "${0##*/}" >&2
  exit 2
}

# Remove the scratch directory; runs on every exit path via trap.
cleanup() {
  if [[ -n "${tmpdir}" ]]; then
    rm -rf -- "${tmpdir}"
  fi
}

main() {
  (( $# >= 1 )) || usage

  local input=$1
  [[ -f "${input}" ]] || die "${input}: no such file"

  local resolution=${DWIM_OCR_RESOLUTION:-360}
  [[ "${resolution}" =~ ^[0-9]+$ ]] \
    || die "DWIM_OCR_RESOLUTION must be a whole number, got '${resolution}'"

  local tool
  for tool in gs parallel tesseract pdfbeads; do
    command -v "${tool}" >/dev/null || die "required tool not found: ${tool}"
  done

  # Strip the extension only when the dot belongs to the file name itself,
  # not to a directory component (e.g. "./doc" or "scan.v2/doc").
  local stem=${input}
  if [[ "${input##*/}" == *.* ]]; then
    stem=${input%.*}
  fi

  # uses $2 if specified, sensible default otherwise
  local output=${2:-${stem}-ocr.pdf}

  # pdfbeads runs from inside the scratch directory, so any relative
  # output path has to be anchored to the caller's directory first.
  if [[ "${output}" != /* ]]; then
    output="${PWD}/${output}"
  fi
  echo "Writing to ${output} ..."

  tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/dwim-ocr.XXXXXXX") \
    || die "cannot create temporary directory"
  trap cleanup EXIT

  # extract images of the pages
  # FIXME at some point; use pdftoppm?
  gs -sDEVICE=tiffg4 -r"${resolution}x${resolution}" \
    -sOutputFile="${tmpdir}/page-%04d.tif" \
    -dNOPAUSE -dBATCH -- "${input}"

  # Work inside a subshell so the caller's working directory is untouched.
  (
    cd "${tmpdir}"
    shopt -s nullglob
    pages=(page-*.tif)
    (( ${#pages[@]} > 0 )) || die "no pages were extracted from ${input}"

    # OCR each page in parallel and convert into PDF
    parallel --no-notice --gnu tesseract {} {.} hocr ::: "${pages[@]}"

    # combine the pages into one PDF; every file here is one of ours
    # (page-NNNN.*), so the bare glob cannot expand to an option
    # shellcheck disable=SC2035
    pdfbeads -o "${output}" *
  )

  # check if job failed
  if [[ ! -s "${output}" ]]; then
    rm -f -- "${output}"
    die "${output} failed."
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment