-
-
Save monkeycycle/ad15433fbb429f124d12 to your computer and use it in GitHub Desktop.
Script for the PDF OCR workflow described in https://ryanfb.github.io/etc/2014/11/13/command_line_ocr_on_mac_os_x.html. See also: https://github.com/fritz-hh/OCRmyPDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
PREFIX=$(basename "$1" .pdf) | |
if [ ! -z "$TESSERACT_FLAGS" ]; then | |
echo "Picked up TESSERACT_FLAGS: $TESSERACT_FLAGS" | |
fi | |
echo "Prefix is: $PREFIX" | |
echo "Converting to TIFF..." | |
if command -v parallel >/dev/null 2>&1; then | |
LAST_PAGE=$(($(pdfinfo "$1"|grep '^Pages:'|awk '{print $2}') - 1)) | |
parallel --bar -u convert -density 300 "$1"\[\{1\}\] -type Grayscale -compress lzw -background white +matte -depth 32 "${PREFIX}_page_%05d.tif" ::: $(seq 0 $LAST_PAGE) | |
else | |
convert -density 300 "$1" -type Grayscale -compress lzw -background white +matte -depth 32 "${PREFIX}_page_%05d.tif" | |
fi | |
echo "Performing OCR..." | |
if command -v parallel >/dev/null 2>&1; then | |
parallel --bar -u --retries 3 "tesseract $TESSERACT_FLAGS {} {.} pdf 2>/dev/null" ::: "${PREFIX}"_page_*.tif | |
else | |
for i in "${PREFIX}"_page_*.tif; do | |
echo $i | |
tesseract $TESSERACT_FLAGS "$i" "$(basename "$i" .tif)" pdf 2>/dev/null | |
done | |
fi | |
echo "Verifying output and converting missing files back without OCR..." | |
for i in "${PREFIX}"_page_*.pdf; do | |
if ! pdfinfo "$i" > /dev/null; then | |
rm -v $i | |
convert -density 300 $(basename $i .pdf).tif $i | |
fi | |
done | |
echo "Combining output to ${PREFIX}-OCR.pdf..." | |
if command -v pdftk >/dev/null 2>&1; then | |
pdftk "${PREFIX}"_page_*.pdf cat output "${PREFIX}-OCR.pdf" | |
else | |
gs -q -dNOPAUSE -dBATCH -dProvideUnicode -sDEVICE=pdfwrite -sOutputFile="${PREFIX}-OCR.pdf" "${PREFIX}"_page_*.pdf >/dev/null 2>&1 | |
fi | |
echo "Cleaning up..." | |
rm "${PREFIX}"_page_*.tif "${PREFIX}"_page_*.pdf "${PREFIX}"_page_*.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment