Created
February 14, 2013 06:20
-
-
Save nishad/4950947 to your computer and use it in GitHub Desktop.
Tesseract lang creation script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Tesseract language creation script.
#
# Runs Tesseract's box.train step on every "*.tif" image found (case-
# insensitively) under ./Trained, builds a unicharset from the "*.box" files,
# runs mftraining/cntraining on the resulting "*.tr" files, renames the
# outputs with the "blast." prefix and finally calls combine_tessdata.
#
# Usage: run from the directory that contains ./Trained. Intermediate files
# (unicharset, inttemp, ...) are written to the current working directory by
# the training tools before being moved into ./Trained.
#
# Requires on PATH: tesseract, unicharset_extractor, mftraining, cntraining,
# combine_tessdata.

set -u

readonly TRAININGDIR="./Trained"
readonly SUFFIX="tif"
readonly FONTPROPERTIES="$TRAININGDIR/font_properties"
readonly LANGNAME="blast"

# Print a diagnostic on stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Print a diagnostic on stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Emit, NUL-delimited, every path under TRAININGDIR whose name matches the
# case-insensitive glob given as $1. NUL delimiters keep paths containing
# whitespace intact.
find_training_files() {
  find "$TRAININGDIR" -iname "$1" -print0
}

# Return 0 when FONTPROPERTIES already has a line whose first field is $1.
has_font_entry() {
  local wanted=$1 first _rest
  [[ -f "$FONTPROPERTIES" ]] || return 1
  while read -r first _rest; do
    [[ "$first" == "$wanted" ]] && return 0
  done < "$FONTPROPERTIES"
  return 1
}

main() {
  local tool path image base name out
  local -a images=() boxes=() trs=()

  for tool in tesseract unicharset_extractor mftraining cntraining \
    combine_tessdata; do
    command -v "$tool" > /dev/null || die "required tool not found: $tool"
  done
  [[ -d "$TRAININGDIR" ]] || die "training directory not found: $TRAININGDIR"

  while IFS= read -r -d '' path; do
    images+=("$path")
  done < <(find_training_files "*.$SUFFIX")
  ((${#images[@]} > 0)) || die "no *.$SUFFIX images found under $TRAININGDIR"

  for image in "${images[@]}"; do
    # Base name without directory or extension. The extension is stripped
    # generically because the find match is case-insensitive (e.g. "x.TIF").
    base=${image##*/}
    name=${base%.*}

    # Train the file; output goes next to the image. A failure on one image
    # is reported but does not stop the remaining images from being trained.
    tesseract "$image" "${image%/*}/$name" nobatch box.train \
      || warn "tesseract box.train failed for $image"

    # While we are at it, create a null (all-zero) font_properties entry.
    # Existing entries are kept so re-running the script does not pile up
    # duplicates or shadow hand-edited values.
    if ! has_font_entry "$name"; then
      printf '%s 0 0 0 0 0\n' "$name" >> "$FONTPROPERTIES" \
        || die "cannot write $FONTPROPERTIES"
    fi
  done

  # Get a list of all the box files.
  while IFS= read -r -d '' path; do
    boxes+=("$path")
  done < <(find_training_files "*.box")
  ((${#boxes[@]} > 0)) || die "no *.box files found under $TRAININGDIR"

  # Create the unicharset (written to the current directory) and move it to
  # the training directory.
  unicharset_extractor "${boxes[@]}" || die "unicharset_extractor failed"
  mv -- unicharset "$TRAININGDIR/unicharset" || die "cannot move unicharset"

  # Get a list of all the .tr files produced by the box.train step.
  while IFS= read -r -d '' path; do
    trs+=("$path")
  done < <(find_training_files "*.tr")
  ((${#trs[@]} > 0)) || die "no *.tr files found under $TRAININGDIR"

  # Perform the clustered training.
  mftraining -F "$FONTPROPERTIES" -U "$TRAININGDIR/unicharset" "${trs[@]}" \
    || die "mftraining failed"
  cntraining "${trs[@]}" || die "cntraining failed"

  # Move the training outputs to the training directory under the language
  # prefix. A missing output only produces a warning so that the remaining
  # files are still moved.
  for out in inttemp mfunicharset Microfeat normproto pffmtable; do
    if [[ -e "$out" ]]; then
      mv -- "$out" "$TRAININGDIR/$LANGNAME.$out" || die "cannot move $out"
    else
      warn "training output '$out' not found; skipping"
    fi
  done
  mv -- "$TRAININGDIR/unicharset" "$TRAININGDIR/$LANGNAME.unicharset" \
    || die "cannot rename unicharset"

  # Combine everything carrying the "<LANGNAME>." prefix into traineddata.
  combine_tessdata "$TRAININGDIR/$LANGNAME."
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment