Created
February 16, 2017 12:02
-
-
Save stesie/42dff3d14fbfac60524f381babb8f81d to your computer and use it in GitHub Desktop.
Shell script that scans pages, runs OCR (tesseract) on them, and creates a PDF containing the down-sampled images with a text overlay
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Abort as soon as any (unchecked) command fails.
set -e

## SCAN settings

# Scan-area options handed to scanimage. The value is expanded unquoted by the
# caller so that it splits into separate options (presumably an A4 page,
# assuming the scanner backend measures -x/-y in millimetres).
FORMAT="-l 0 -t 0 -x 210 -y 297"

# Scan mode passed to scanimage via --mode.
MODE=color

# Scan resolution passed to scanimage via --resolution; postprocess_scan also
# uses it to compute the down-sampling factor.
RESOLUTION=300

# OCR language passed to tesseract via -l.
# NOTE(review): LANG is also the standard locale environment variable, so this
# assignment may change the locale seen by child processes. Renaming it needs a
# matching change in postprocess_scan, which reads "$LANG".
LANG=deu
#######################################
# Post-process one scanned page: OCR it with tesseract, strip the
# full-resolution image out of tesseract's PDF so that only the text layer
# remains, and lay that text over a down-sampled copy of the scan.
# Globals:
#   LANG        (read) language handed to tesseract via -l
#   RESOLUTION  (read) scan resolution; the image is scaled by
#               150 * 100 / RESOLUTION percent
#   HOME        (read) "$HOME/.tesseract/user-words" is used when present
# Arguments:
#   $1 - path prefix of the page; the scan is read from "$1.tif"
# Outputs:
#   Writes the finished page to "$1.pdf" and removes "$1", "$1.tif" and all
#   intermediate files. tesseract messages that are not filtered go to stdout.
# Returns:
#   Non-zero when tesseract or any later step fails.
#######################################
postprocess_scan() {
  local tmpfile=$1
  local -a tess_opts=()
  local -a status
  local scale_percent

  if [[ -e "$HOME/.tesseract/user-words" ]]; then
    tess_opts+=(--user-words "$HOME/.tesseract/user-words")
  fi

  # tesseract unfortunately has no --quiet option, so its chatter is filtered.
  # grep exits with 1 when every line was filtered away; that is the normal
  # case and must not abort the job under `set -e`. Real grep errors (>1) still
  # propagate.
  tesseract "${tess_opts[@]}" -l "$LANG" "$tmpfile.tif" "$tmpfile" pdf 2>&1 \
    | {
      grep -v \
        -e "Tesseract Open Source OCR Engine" \
        -e "^Page" \
        -e "Warning in pixReadMemTiff" ||
        (( $? == 1 ))
    }
  status=("${PIPESTATUS[@]}")
  # The pipeline's own status is grep's, so check tesseract explicitly.
  if (( status[0] != 0 )); then
    echo "tesseract failed for $tmpfile.tif (exit status ${status[0]})" >&2
    return 1
  fi

  # next step: remove image from pdf
  #
  #  -> image payload always stored in object 11 (remove it)
  #  -> remove XObject from document
  sed -e "/\/XObject << \/Im1 11 0 R >>/d" \
    -e "/^11 0 obj/,/^endobj/d" \
    "$tmpfile.pdf" \
    | csplit --quiet --prefix="$tmpfile" - \
      '/^10 0 obj/' '/^stream/+1' '/^endobj/' '/^xref/' '/^trailer/' '/^startxref/'

  # csplit generated following parts now:
  #  -> 00 : pdf start part
  #  -> 01 : object 10 header (= document content)
  #  -> 02 : zlib deflated content of object 10
  #  -> 03 : everything beyond object 10 (including the "endobj" string of object 10)
  #  -> 04 : xref table
  #  -> 05 : trailer
  #  -> 06 : startxref pointer + %%EOF

  # remove image reference from object 10 (the lines starting with "q").
  # A page without any other content leaves grep with nothing to print, which
  # makes it exit 1; tolerate exactly that status.
  {
    echo "stream"
    zlib-flate -uncompress < "${tmpfile}02" | grep -v -e '^q' || (( $? == 1 ))
    echo "endstream"
  } > "${tmpfile}02.patched"

  # regenerate object 10 header; the content is stored uncompressed now, so the
  # new dictionary only carries a /Length entry.
  # NOTE(review): the size is taken from the whole patched part, i.e. it also
  # counts the "stream"/"endstream" keyword lines - kept as in the original.
  printf '10 0 obj\n<<\n/Length %s\n>>\n' \
    "$(stat -c %s "${tmpfile}02.patched")" > "${tmpfile}01.patched"

  # re-pack pdf, leaving away the xref table
  {
    cat "${tmpfile}00" "${tmpfile}01.patched" "${tmpfile}02.patched" \
      "${tmpfile}03" "${tmpfile}05"
    echo "%%EOF"
  } > "${tmpfile}-overlay.pdf"

  # down-sample scanned image + convert to pdf
  scale_percent=$(( 150 * 100 / RESOLUTION ))
  convert "$tmpfile.tif" -resize "${scale_percent}%" "$tmpfile.scaled.tif"
  tiff2pdf -zj "$tmpfile.scaled.tif" -o "$tmpfile.scaled.pdf"

  # combine text layer with down-sampled image
  pdftk "${tmpfile}-overlay.pdf" background "$tmpfile.scaled.pdf" output "$tmpfile.pdf"

  # drop the scan, the scaled copies, the mktemp placeholder and all csplit parts
  rm -- "$tmpfile".{tif,scaled.tif,scaled.pdf} \
    "$tmpfile"{,00,01,01.patched,02,02.patched,03,04,05,06,-overlay.pdf}
}
# Main part: scan sheet after sheet until the user enters 'q'. Each page is
# post-processed (OCR etc.) in the background while the next sheet is being
# scanned; afterwards all page PDFs are bundled into result.pdf.
PAGE=0
PARTS=()      # per-page PDF files, in scan order
JOBS=()       # PIDs of the background post-processing jobs, same order
userinput=""

while [[ "$userinput" != "q" ]]; do
  PAGEFILE=$(mktemp)
  PAGE=$(( PAGE + 1 ))

  echo ""
  echo "Scanning page $PAGE ..."
  # FORMAT bundles several scanimage options and is split on purpose.
  # shellcheck disable=SC2086
  scanimage --format tiff --mode "$MODE" $FORMAT --resolution "$RESOLUTION" -p > "$PAGEFILE.tif"

  postprocess_scan "$PAGEFILE" &
  JOBS+=("$!")
  PARTS+=("$PAGEFILE.pdf")

  echo ""
  echo ""
  echo "Scan complete. Insert next sheet and hit RET to scan another page."
  echo "Type 'q RET' to exit scanning"
  # End of input (e.g. Ctrl-D) is treated like 'q' so the pages scanned so far
  # are still bundled.
  read -r userinput || userinput="q"
done

echo ""
echo "Waiting for child jobs to complete ..."

# Collect every job's exit status; `wait` inside a condition does not trip
# `set -e`, so all jobs are reaped before deciding how to go on.
failed=0
for i in "${!JOBS[@]}"; do
  if ! wait "${JOBS[$i]}"; then
    echo "Post-processing of page $(( i + 1 )) failed." >&2
    failed=$(( failed + 1 ))
  fi
done

if (( failed > 0 )); then
  echo "$failed page(s) could not be processed, not bundling." >&2
  exit 1
fi

echo ""
echo "Child jobs finished, bundling PDFs now ..."
pdftk "${PARTS[@]}" cat output result.pdf
rm -- "${PARTS[@]}"

echo ""
echo "Final PDF document stored to : result.pdf"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment