Last active
January 23, 2021 15:37
-
-
Save antipatico/f48e0f4211d963c3297cbc33841b52d6 to your computer and use it in GitHub Desktop.
Take a PDF, OCR it, and add OCR Text as background layer to original PDF to make it searchable.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Take a PDF, OCR it, and produce a searchable PDF.
#
# Usage: ocrpdf.sh INPUT.pdf OUTPUT.pdf
#
# How it works: the input is split into single-page PDFs, every page is
# rasterized to JPEG, tesseract turns each JPEG into a PDF carrying the
# recognized text, the pages are concatenated again and the result is
# recompressed by Ghostscript. Note that the output is rebuilt from the
# rasterized pages; it is not the original PDF with a text layer added.
#
# Hacked together using tips from these websites:
# http://www.jlaundry.com/2012/ocr-a-scanned-pdf-with-tesseract/
# http://askubuntu.com/questions/27097/how-to-print-a-regular-file-to-pdf-from-command-line
#
# Derived from: https://gist.github.com/jburon/d31e0132dfb291dc804bac019f9d9023
# Original: https://gist.github.com/wcaleb/7337097
#
# Author: antipatico (https://github.com/antipatico)
# Date: 23 Jan 2021
# Version: 0.2
# Dependencies: pdftk tesseract-ocr poppler-utils ghostscript parallel
#
# Exit status: 0 on success, non-zero on bad usage, a missing dependency,
# or the failure of any processing step.

set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Abort unless the executable $1 is on PATH; $2 is the package name shown
# to the user in the error message.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "ERROR: $2 is not installed."
}

if [[ $# -lt 2 ]]; then
  die "USAGE: $0 INPUT.pdf OUTPUT.pdf"
fi

require_cmd pdftoppm "poppler-utils"
require_cmd tesseract "tesseract-ocr"
require_cmd pdftk "pdftk"
require_cmd gs "ghostscript"
require_cmd parallel "GNU parallel"

input_pdf=$1
output_pdf=$2

# Fail early with a clear message instead of a bare cp error.
[[ -r "$input_pdf" ]] || die "ERROR: cannot read input file \"$input_pdf\"."

# The scratch directory is removed on every exit path (success, failure
# under `set -e`, or a signal that terminates the shell via exit).
workdir=""
cleanup() {
  if [[ -n "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}
trap cleanup EXIT

workdir=$(mktemp -d)
cp -- "$input_pdf" "$workdir/input.pdf"

# All intermediate files use relative names, so work from inside the
# scratch directory; popd below restores the caller's directory so that a
# relative OUTPUT.pdf resolves as the user expects.
pushd "$workdir" >/dev/null

echo "ocrpdf.sh version 0.2"
echo "Script created by antipatico (https://github.com/antipatico)"
echo ""
echo "Current work directory: \"$workdir\""

echo "Step 1/5: splitting the pdf into pages"
# Zero-padded page numbers keep glob (lexical) order equal to page order.
pdftk input.pdf burst output tesspage_%05d.pdf

echo "Step 2/5: converting each page into jpg"
parallel --bar 'pdftoppm -jpeg "{}" "{.}"' ::: tesspage_*.pdf
rm -f tesspage_*.pdf

echo "Step 3/5: ocr each image and convert back into pdf"
# tesseract's console output is discarded on purpose; a failing job still
# makes parallel return non-zero, which aborts the script via `set -e`.
parallel --bar 'tesseract "{}" "ocrpdf_{.}" pdf >/dev/null 2>&1' ::: tesspage_*.jpg

echo "Step 4/5: merge ocr-ed pages back into a single pdf"
rm -f tesspage_*.jpg
pdftk ocrpdf_*.pdf cat output output.pdf

echo "Step 5/5: optimizing PDF for ebooks"
rm -f ocrpdf_*.pdf
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/ebook \
  -dNOPAUSE -dQUIET -dBATCH -sOutputFile=output-optimized.pdf output.pdf

popd >/dev/null

mv -- "$workdir/output-optimized.pdf" "$output_pdf"
# The scratch directory itself is deleted by the EXIT trap.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment