Created
February 9, 2014 01:10
-
-
Save dllud/8892741 to your computer and use it in GitHub Desktop.
pdfocr - script to transform a PDF containing a scanned book into a searchable PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# pdfocr.sh - transform a PDF containing a scanned book into a searchable PDF.
#
# Based on previous script and many good tips by Konrad Voelkel:
# http://blog.konradvoelkel.de/2010/01/linux-ocr-and-pdf-problem-solved/
# http://blog.konradvoelkel.de/2013/03/scan-to-pdfa/
#
# Depends on convert (ImageMagick), pdftk and hocr2pdf (ExactImage).
# $ sudo apt-get install imagemagick pdftk exactimage
# You also need at least one OCR software which can be either tesseract or cuneiform.
# $ sudo apt-get install tesseract-ocr
# $ sudo apt-get install cuneiform
# To install languages into tesseract do (e.g. for Portuguese):
# $ sudo apt-get install tesseract-ocr-por
#
# usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title
# where ocr-sfw is either tesseract or cuneiform
# split is either 0 (already single-paged) or 1 (2 book-pages per pdf-page)
# lang is a language as in "tesseract --list-langs" or "cuneiform -l".
# and author, title are used for the PDF metadata (optional; empty if omitted).
#
# usage example:
# ./pdfocr.sh SomeFile.pdf tesseract 1 por "Some Author" "Some Title"
#
# The result is written next to the input as "<input without extension>-ocr.pdf".
# All intermediate files live in a private temporary directory created beside
# the input file and removed on exit, so nothing else in the caller's
# directory is touched.

set -euo pipefail
shopt -s nullglob

# Resolution (dpi) used both for rasterizing pages and for hocr2pdf.
readonly DPI=300

# Scratch directory; global so the EXIT trap can still see it after main returns.
workdir=""

# psm flag spelling handed to tesseract. Starts with the historic "-psm" and
# is switched to "--psm" by run_tesseract if the historic spelling fails.
tess_psm_flag="-psm"

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line on stderr and abort with status 2.
usage() {
  echo "usage: ./pdfocr.sh document.pdf ocr-sfw split lang author title" >&2
  exit 2
}

# Remove the scratch directory (no-op if it was never created).
cleanup() {
  if [[ -n "$workdir" && -d "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}

# Abort unless the program named by $1 can be found in PATH.
require_tool() {
  command -v "$1" >/dev/null 2>&1 \
    || die "$1 is required but was not found in PATH."
}

# Turn every pg_*.pdf in the current directory into PNG image(s).
# Arguments: $1 - "1" to cut each PDF page into two halves (left/right),
#                 anything else to keep one image per page.
rasterize_pages() {
  local split=$1
  local page
  local -a pages=(pg_*.pdf)

  (( ${#pages[@]} > 0 )) || die "pdftk burst produced no pages."

  for page in "${pages[@]}"; do
    if [[ "$split" == "1" ]]; then
      # Cropping to 50% width makes convert emit two images per page
      # ("$page-0.png" and "$page-1.png").
      convert -normalize -density "$DPI" -depth 8 -crop '50%x100%' +repage \
        "$page" "$page.png"
    else
      convert -normalize -density "$DPI" -depth 8 "$page" "$page.png"
    fi
  done

  # The single-page PDFs are no longer needed and would otherwise be picked
  # up again by the final "pg_*.pdf" merge.
  rm -f -- "${pages[@]}"
}

# Run tesseract on one image, leaving hOCR output in "$image.html".
# Arguments: $1 - language, $2 - image file
run_tesseract() {
  local lang=$1 image=$2

  if ! tesseract -l "$lang" "$tess_psm_flag" 1 "$image" "$image" hocr; then
    # NOTE(review): newer tesseract releases appear to accept only "--psm";
    # retry once with that spelling and remember it for the remaining pages.
    [[ "$tess_psm_flag" == "-psm" ]] || return 1
    tess_psm_flag="--psm"
    tesseract -l "$lang" "$tess_psm_flag" 1 "$image" "$image" hocr || return 1
  fi

  # Depending on the tesseract version the hOCR file is named either
  # "<base>.html" or "<base>.hocr"; normalize to ".html".
  if [[ ! -e "$image.html" && -e "$image.hocr" ]]; then
    mv -- "$image.hocr" "$image.html"
  fi
}

# OCR every pg_*.png in the current directory and build one searchable
# single-page PDF ("<image>.pdf") per image.
# Arguments: $1 - OCR engine (tesseract|cuneiform), $2 - language
ocr_pages() {
  local engine=$1 lang=$2
  local image
  local -a images=(pg_*.png)

  (( ${#images[@]} > 0 )) || die "convert produced no page images."

  for image in "${images[@]}"; do
    case "$engine" in
      tesseract)
        run_tesseract "$lang" "$image" || die "tesseract failed on $image."
        ;;
      cuneiform)
        cuneiform -l "$lang" -f hocr -o "$image.html" "$image" \
          || die "cuneiform failed on $image."
        ;;
    esac

    [[ -s "$image.html" ]] || die "No hOCR output was produced for $image."

    hocr2pdf -i "$image" -r "$DPI" -s -o "$image.pdf" < "$image.html" \
      || die "hocr2pdf failed on $image."
  done
}

# Merge the per-page PDFs, restore the original document data and set the
# Author/Title/Creator metadata.
# Arguments: $1 - author, $2 - title, $3 - absolute path of the final PDF
assemble_pdf() {
  local author=$1 title=$2 output=$3
  local -a page_pdfs=(pg_*.pdf)

  (( ${#page_pdfs[@]} > 0 )) || die "No OCRed pages to merge."

  pdftk "${page_pdfs[@]}" cat output merged.pdf
  # doc_data.txt was written by "pdftk burst" and carries the source
  # document's metadata.
  pdftk merged.pdf update_info_utf8 doc_data.txt output merged+data.pdf

  {
    printf 'InfoBegin\nInfoKey: Author\nInfoValue: %s\n' "$author"
    printf 'InfoBegin\nInfoKey: Title\nInfoValue: %s\n' "$title"
    printf 'InfoBegin\nInfoKey: Creator\nInfoValue: %s\n' "PDF OCR scan script"
  } > in.info

  pdftk merged+data.pdf update_info_utf8 in.info output "$output"
}

main() {
  (( $# >= 4 )) || usage

  local input=$1 engine=$2 split=$3 lang=$4
  local author=${5-} title=${6-}

  # Reject a bad engine name before doing any work.
  case "$engine" in
    tesseract | cuneiform) ;;
    *) die "$engine is not a valid OCR software." ;;
  esac

  [[ -f "$input" ]] || die "$input: no such file."

  local tool
  for tool in convert pdftk hocr2pdf "$engine"; do
    require_tool "$tool"
  done

  # Work with absolute paths because processing happens inside workdir.
  local input_abs
  case "$input" in
    /*) input_abs=$input ;;
    *) input_abs=$PWD/$input ;;
  esac

  # Strip the extension from the file name only, never from a directory
  # component that happens to contain a dot.
  local dir=${input_abs%/*}
  local base=${input_abs##*/}
  local stem=${base%.*}
  [[ -n "$stem" ]] || stem=$base
  local output="${dir}/${stem}-ocr.pdf"

  # Scratch space beside the input: page images of a whole book can be
  # large, and this directory must be writable for the output anyway.
  workdir=$(mktemp -d "${dir:-/}/pdfocr.XXXXXX") \
    || die "Could not create a temporary directory in ${dir:-/}."
  trap cleanup EXIT
  cd "$workdir" || die "Could not enter $workdir."

  pdftk "$input_abs" burst dont_ask
  rasterize_pages "$split"
  ocr_pages "$engine" "$lang"
  assemble_pdf "$author" "$title" "$output"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment