Created
November 16, 2014 09:23
-
-
Save edvind/54c37fd2f585e984ce5e to your computer and use it in GitHub Desktop.
A small script for quick and dirty image to pdf conversion with ocr.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Bookbinder by Joel Bergroth
# A small script for quick and dirty image-to-PDF conversion with OCR, for example when your scanner produces multipage TIFFs and you want a small OCR'd PDF for reading on your tablet.
#
# Requirements: imagemagick, tesseract, scantailor, pdfbeads
#
#
# Simple usage: sh bookbinder [/path/to/folder]
#
# Works with black and white multipage TIFFs stored in a folder.
# Example: sh bookbinder exampledir
# Example: sh bookbinder /home/username/exampledir
# Example: sh bookbinder exampledir swe
#
# Custom usage: sh bookbinder [/path/to/folder] [lang] [density] [color]
# Example: sh bookbinder exampledir swe 300 color
# The first argument is the folder holding the scanned images. Refuse to go
# on without it, or when it does not name a directory, and report the problem
# on stderr so it is not mistaken for normal progress output.
if [ -z "$1" ]; then
  echo "No input directory." >&2
  exit 1
fi
if [ ! -d "$1" ]; then
  echo "Input directory not found: $1" >&2
  exit 1
fi
# Optional arguments; each falls back to a default when missing or empty.
#   $2 - language code handed to tesseract (default: eng)
#   $3 - value for convert's -density option (default: option not passed)
#   $4 - any non-empty value switches to colour mode (default: off)

language=${2:-eng}

# density holds either nothing or the complete option pair "-density <value>",
# ready to be placed on the convert command line.
density=""
if [ -n "$3" ]; then
  density="-density $3"
fi

# Colour mode splits the input into PNG pages and adds scantailor's
# colour/grayscale flag; otherwise pages are split into TIFF with no flag.
if [ -n "$4" ]; then
  splitformat=png
  colormode="--color-mode=color_grayscale"
else
  splitformat=tif
  colormode=""
fi
# Main pipeline: split the scans into single pages, clean them with
# scantailor, OCR every page with tesseract and bind the result with pdfbeads.
# Reads: $1 (input directory), language, density, splitformat, colormode.
# Writes: split/, pages/ and tesseract-log.txt inside the input directory,
#         and out.pdf in the directory that contains the input directory.

# Everything below uses paths relative to the input directory. Stop if it
# cannot be entered; otherwise the steps would run on whatever directory the
# script was started from.
cd -- "$1" || {
  echo "Cannot enter input directory: $1" >&2
  exit 1
}

# Collect the scans BEFORE creating any working directory, so that "split",
# "pages" and the log left by an earlier run are not fed to convert.
# The "./" prefix keeps names starting with "-" from being read as options.
# The positional parameters are reused as the file list; $1..$4 are no longer
# needed at this point.
set --
for scan in ./*; do
  [ -f "$scan" ] || continue
  [ "$scan" = "./tesseract-log.txt" ] && continue
  set -- "$@" "$scan"
done
if [ "$#" -eq 0 ]; then
  echo "No image files found in input directory." >&2
  exit 1
fi

# -p: do not fail when the directory is left over from a previous run.
mkdir -p split || exit 1
echo "Splitting multipage image files."
# $density is either empty or the two words "-density <value>", so it is
# deliberately left unquoted to let the shell split it.
# shellcheck disable=SC2086
convert $density "$@" "split/%05d.$splitformat" ||
  echo "Warning: convert reported errors; continuing with the pages it produced." >&2

# Go on only if the split step actually produced something.
set -- split/*
if [ ! -e "$1" ]; then
  echo "No pages were produced by convert." >&2
  exit 1
fi

mkdir -p pages || exit 1
echo "Running scans through scantailor."
# The colour-mode flag is passed as a single word, and only when it is set.
scantailor-cli ${colormode:+"$colormode"} "$@" pages ||
  echo "Warning: scantailor-cli reported errors; continuing with the pages it produced." >&2

echo "Running OCR with tesseract using language: $language"
cd pages || exit 1

# An unmatched glob stays literal, so test the first entry for existence.
set -- *.tif
if [ ! -e "$1" ]; then
  echo "No .tif pages found in pages/ after scantailor." >&2
  exit 1
fi

for img in "$@"; do
  # Output base name: the page name without its .tif extension.
  filename=${img%.*}
  # tesseract's stderr is appended to a log in the input directory. A page
  # that fails OCR is reported but does not stop the remaining pages.
  if tesseract "$img" "$filename" -l "$language" hocr 2>> ../tesseract-log.txt; then
    echo " Processed $img"
  else
    echo " Failed $img (see tesseract-log.txt)" >&2
  fi
done

echo "Binding book using pdfbeads."
# ../../ from pages/ is the parent of the input directory.
pdfbeads --bg-compression JPG --output ../../out.pdf "$@" || {
  echo "pdfbeads failed." >&2
  exit 1
}
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment