Last active
May 22, 2018 14:14
-
-
Save jpstroop/f15ac30dd138d7337b68292f98d828d3 to your computer and use it in GitHub Desktop.
Guess the orientation of an image using OCR and Spellcheck
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Guess the orientation of an image using OCR and spellcheck.
#
# Usage: <this script> IMAGE
#
# Script takes a single argument, which is a path to an image file.
# NOTE: this file will be replaced with the version that this script deems to
# be 'correct'. If any step fails, the script exits non-zero and leaves the
# input file untouched.
#
# How it works:
#   1. Make 0/90/180/270 degree variants of the input (north/east/south/west).
#   2. OCR each variant.
#   3. Keep the two variants whose OCR output has the fewest words
#      (least 'words' = least whitespace junk). These are likely right side
#      up and upside down, but generally too close to call beyond that.
#   4. Spellcheck those two; the one with the fewest misspelled words is
#      most likely the correct orientation.
#
# Dependencies:
#   convert: apt-get install imagemagick
#   ocrad:   apt-get install ocrad
#   aspell:  must have an English ("en") dictionary available

set -euo pipefail

# Fixed scratch directory. It is wiped at the start of each run rather than at
# the end, because leaving the files around is handy for debugging. Because
# the path is fixed, concurrent runs will clobber each other.
readonly TMP="/tmp/pulfa/img_harvester/rotation-calc"

readonly ASPELL="/usr/bin/aspell"
readonly BASENAME="/usr/bin/basename"
readonly CONVERT="/usr/bin/convert"
readonly HEAD="/usr/bin/head"
readonly OCRAD="/usr/bin/ocrad"
readonly SORT="/usr/bin/sort"
readonly WC="/usr/bin/wc"

# Print a message to stderr and exit non-zero.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Pick the best rotation of an image and overwrite the image with it.
# Globals:   TMP and the tool path constants (read)
# Arguments: $1 - path to the image file (replaced in place)
# Outputs:   diagnostics on stderr; scratch files under $TMP
# Returns:   exits non-zero (via die) on any failure
#######################################
main() {
  local file=${1:-}
  [[ -n "$file" ]] || die "Usage: ${0##*/} IMAGE"
  [[ -f "$file" ]] || die "Not a regular file: $file"

  local tool
  for tool in "$ASPELL" "$BASENAME" "$CONVERT" "$HEAD" "$OCRAD" "$SORT" "$WC"; do
    [[ -x "$tool" ]] || die "Missing dependency: $tool"
  done

  # Clean up if there are files from the last run. mkdir -p is needed because
  # the parent directories may not exist yet.
  rm -rf -- "${TMP:?}" || die "Cannot remove $TMP"
  mkdir -p -- "$TMP" || die "Cannot create $TMP"

  # Every variant lives at "$prefix-<orientation>"; its OCR text is at
  # "$prefix-<orientation>.txt".
  local file_name
  file_name=$("$BASENAME" -- "$file") || die "Cannot get basename of $file"
  local prefix="$TMP/$file_name"

  # Make 90 degree variants of the input file. The input file is north.
  # TODO: any image cleanup (e.g. despeckle?) here
  cp -- "$file" "$prefix-north" || die "Cannot copy $file"
  "$CONVERT" -rotate 90 "$file" "$prefix-east" || die "Rotation (90) failed"
  "$CONVERT" -rotate 180 "$file" "$prefix-south" || die "Rotation (180) failed"
  "$CONVERT" -rotate 270 "$file" "$prefix-west" || die "Rotation (270) failed"

  # OCR each variant and record its word count. The tables hold orientation
  # names rather than paths, so whitespace in the input path cannot corrupt
  # the field parsing below.
  local wc_table="$TMP/wc_table"
  local orientation words
  : > "$wc_table"
  for orientation in north east south west; do
    "$OCRAD" -f -F utf8 "$prefix-$orientation" -o "$prefix-$orientation.txt" \
      || die "OCR failed for $prefix-$orientation"
    words=$("$WC" -w < "$prefix-$orientation.txt") \
      || die "Cannot count words in $prefix-$orientation.txt"
    words=${words//[[:space:]]/} # some wc implementations pad the number
    printf '%s %s\n' "$words" "$orientation" >> "$wc_table"
  done

  # Take the bottom two word counts.
  local bottom_two_wc_table="$TMP/bottom_two_wc_table"
  "$SORT" -n "$wc_table" | "$HEAD" -n 2 > "$bottom_two_wc_table" \
    || die "Cannot rank word counts"

  # Spellcheck the two candidates. pipefail makes an aspell failure (e.g. a
  # missing dictionary) fatal instead of counting as zero misspellings.
  local misspelled_words_table="$TMP/misspelled_words_table"
  local misspelled_word_count
  : > "$misspelled_words_table"
  while read -r words orientation; do
    misspelled_word_count=$(
      "$ASPELL" -l en list < "$prefix-$orientation.txt" | "$WC" -w
    ) || die "Spellcheck failed for $prefix-$orientation.txt"
    misspelled_word_count=${misspelled_word_count//[[:space:]]/}
    printf '%s %s %s\n' "$misspelled_word_count" "$words" "$orientation" \
      >> "$misspelled_words_table"
  done < "$bottom_two_wc_table"

  # Do the sort and overwrite the input file with the winning variant.
  local winner
  winner=$("$SORT" -n "$misspelled_words_table" | "$HEAD" -n 1) \
    || die "Cannot rank spellcheck results"
  orientation=${winner##* } # last field of "<misspelled> <words> <orientation>"
  [[ -n "$orientation" && -f "$prefix-$orientation" ]] \
    || die "Could not determine a winning orientation"
  mv -- "$prefix-$orientation" "$file" || die "Cannot replace $file"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment