Created
July 27, 2009 11:19
-
-
Save tariqadel/156151 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# This script turns ugly scanned images into plaintext.
#
# Usage: ocr <outputfile> <filestobeconverted>
#
# Each input image is converted to TIFF with ImageMagick (optionally cleaned
# up with unpaper first) and handed to the OCR program. The recognised text
# is appended to <outputfile> as one long line, followed by a "---" separator
# line. Intermediate files are written next to each input as <image>$EXT.*
# and are left in place; remove them by hand when you are done.
#
# Exit status:
#   0  all arguments were walked (per-image failures are only warned about)
#   1  fewer than two arguments were given
#   2  the output file could not be (re)created
#
# @version 1.00 (ultra alpha)

# OCR program path; we assume tesseract, though.
OCR=tesseract
# Imagemagick
CONVERT=convert
# Do unpaper. This operation is costly, but can improve OCR.
UNPAPER=unpaper
UNPAPEROPTS=" --overwrite "
# Set this to 1 if you want more accurate OCR.
DOUNPAPER=0
# EXT, for easy cleanup
EXT=".OCR"
# SUPER dangerous, make $EXT very unique! Set to 1 to enable
# Edit: functionality removed.
# shellcheck disable=SC2034  # kept so existing edited copies stay compatible
DOCLEANUP=0

USAGE="Usage: ocr <outputfile> <filestobeconverted>"
ERR2="Unable to create output file."

# Convert one image to TIFF, OCR it and append the text to $OUTPUTFILE.
# Globals:   CONVERT, UNPAPER, UNPAPEROPTS, DOUNPAPER, OCR, EXT, OUTPUTFILE
# Arguments: $1 - path of the image to process
# Returns:   status of the first step that failed, otherwise 0
# Note: POSIX sh has no `local`, so the pi_ prefix keeps these names private.
process_image() {
  pi_src=$1
  pi_work=$pi_src$EXT

  if [ "$DOUNPAPER" = 1 ]; then
    "$CONVERT" "$pi_src" "$pi_work.pnm" || return
    # shellcheck disable=SC2086  # UNPAPEROPTS is intentionally word-split
    "$UNPAPER" $UNPAPEROPTS "$pi_work.pnm" "$pi_work.unpapered.pnm" || return
    # tif required for tesseract
    "$CONVERT" "$pi_work.unpapered.pnm" "$pi_work.tif" || return
  else
    # Convert to tif
    "$CONVERT" "$pi_src" "$pi_work.tif" || return
  fi

  # The OCR program is given "$pi_work" as output base; the text is then
  # read back from "$pi_work.txt".
  "$OCR" "$pi_work.tif" "$pi_work" || return

  # Make all new lines spaces. All lines end at the right of
  # the page, so tesseract happily inserts a line break there
  # which we don't want. Now our document fits on one line,
  # awesome
  tr '\n' ' ' < "$pi_work.txt" >> "$OUTPUTFILE"
}

# We need a minimum of two arguments
if [ "$#" -lt 2 ]; then
  printf '%s\n' "$USAGE" >&2
  exit 1
fi

OUTPUTFILE=$1
# Drop the output file from the argument list so "$@" holds only the images.
shift

# Attempt to output some file. Run rm first to delete file if it exists,
# USERS BEWARE!
# -f keeps rm quiet when the file does not exist yet.
if ! { rm -f -- "$OUTPUTFILE" && touch -- "$OUTPUTFILE"; }; then
  # We couldn't
  printf '%s\n' "$ERR2" >&2
  exit 2
fi

for image in "$@"; do
  # Never feed the output file back in as an input image.
  if [ "$image" = "$OUTPUTFILE" ]; then
    continue
  fi

  # Best effort: a broken image is reported but does not stop the batch.
  if ! process_image "$image"; then
    printf 'ocr: failed to process %s\n' "$image" >&2
  fi

  # One separator per input, whether or not any text was recovered.
  echo "---" >> "$OUTPUTFILE"
done

# $EXT already starts with a dot, so the cleanup glob is *$EXT.* (not *.$EXT.*).
printf 'Finished ripping strings from images. Do rm *%s.* to cleanup.\n' "$EXT"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment