Skip to content

Instantly share code, notes, and snippets.

@antipatico
Last active January 23, 2021 15:37
Show Gist options
  • Save antipatico/f48e0f4211d963c3297cbc33841b52d6 to your computer and use it in GitHub Desktop.
Save antipatico/f48e0f4211d963c3297cbc33841b52d6 to your computer and use it in GitHub Desktop.
Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
#!/bin/bash
# Take a PDF, OCR every page with tesseract, and rebuild it as a searchable PDF (page image plus recognized text).
#
# Hacked together using tips from these websites:
# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
#
#
# Derived from: https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
# Original: https://gist.github.com/wcaleb/7337097
#
# Author: antipatico (https://github.com/antipatico)
# Date: 23 Jan 2021
# Version: 0.2
# Dependencies: pdftk tesseract-ocr poppler-utils ghostscript parallel
# Validate the command line and verify that every external tool the script
# relies on is available before any work is started.
# Exits with status 1 (message on stderr) if an argument or a tool is missing.
if [ $# -lt 2 ]; then
  echo "USAGE: $0 INPUT.pdf OUTPUT.pdf" >&2
  exit 1
fi
# Each entry is "executable:name shown to the user"; the executable is what is
# looked up on PATH, the name is what the error message tells the user to
# install.
for dep in \
  "pdftoppm:poppler-utils" \
  "tesseract:tesseract-ocr" \
  "pdftk:pdftk" \
  "gs:ghostscript" \
  "parallel:GNU parallel"; do
  tool=${dep%%:*}
  pkg=${dep#*:}
  # `command -v` is a shell builtin, unlike `which`, whose presence and output
  # differ between systems.
  if ! command -v "$tool" >/dev/null 2>&1; then
    # Diagnostics go to stderr (the previous `2>&1` sent them to stdout).
    echo "ERROR: $pkg is not installed." >&2
    exit 1
  fi
done
unset dep tool pkg
# Main pipeline: split INPUT.pdf ($1) into pages, rasterize and OCR every page
# in parallel, merge the OCR-ed pages, optimize the result with ghostscript and
# store it as OUTPUT.pdf ($2).
#
# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail
# All intermediate files live in a private scratch directory.
workdir=$(mktemp -d)
# Remove the scratch directory on every exit path. Without the trap, a failing
# step (which terminates the script because of `set -e`) would leave the
# directory and all page images behind.
cleanup() {
  rm -rf -- "${workdir:?}"
}
trap cleanup EXIT
# Copy the input before changing directory so that a relative INPUT path is
# resolved against the caller's working directory. `--` protects against file
# names that start with a dash.
cp -- "$1" "$workdir/input.pdf"
pushd "$workdir" >/dev/null
echo "ocrpdf.sh version 0.2"
echo "Script created by antipatico (https://github.com/antipatico)"
echo ""
echo "Current work directory: \"$workdir\""
echo "Step 1/5: splitting the pdf into pages"
# Zero-padded page numbers keep the shell globs below in page order.
pdftk input.pdf burst output tesspage_%05d.pdf
echo "Step 2/5: converting each page into jpg"
# {} is the page PDF, {.} the same name without extension (used as the image
# name prefix).
parallel --bar 'pdftoppm -jpeg "{}" "{.}"' ::: tesspage_*.pdf
rm -f tesspage_*.pdf
echo "Step 3/5: ocr each image and convert back into pdf"
# tesseract output is silenced so it does not interleave with the progress
# bar; a failing job still makes parallel return non-zero and aborts the
# script.
parallel --bar 'tesseract "{}" "ocrpdf_{.}" pdf >/dev/null 2>&1' ::: tesspage_*.jpg
echo "Step 4/5: merge ocr-ed pages back into a single pdf"
rm -f tesspage_*.jpg
pdftk ocrpdf_*.pdf cat output output.pdf
echo "Step 5/5: optimizing PDF for ebooks"
rm -f ocrpdf_*.pdf
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output-optimized.pdf output.pdf
# Return to the caller's directory so that a relative OUTPUT path is resolved
# against it.
popd >/dev/null
mv -- "$workdir/output-optimized.pdf" "$2"
# The scratch directory itself is removed by the EXIT trap.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment