Last active
May 19, 2021 17:26
-
-
Save cboulanger/cb4a99f7e03fb86141e511f15e3cfc5e to your computer and use it in GitHub Desktop.
Selects from different UTF-8 documents that are the result of OCR processing of the same source document, choosing the one with the highest quality (i.e. highest language recognition confidence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# For every source PDF, score several OCR results of that document and copy
# the winning one into a target directory.
#
# see https://ryanfb.github.io/etc/2015/03/16/automatic_evaluation_of_ocr_quality.html
# using https://github.com/saffsd/langid.py
# install with pip install langid and add the scorelines.sh & ocrquality.rb
# scripts from the blog entry. Both are invoked as ./scorelines.sh and
# ./ocrquality.rb, so run this script from the directory that contains them.

# Glob pattern for the PDF source files, whose names start with a DOI; adapt
# this for your case. The pattern is expanded (word-split and globbed) when
# the main loop runs, so the pattern itself must not contain whitespace.
FILE_SELECTOR='/path/to/source/dir/*.pdf'

# The path to the directory to which the selected documents should be copied.
# It must already exist; its (non-hidden) contents are deleted on every run.
TARGET=/path/to/target/dir

# Directories that hold the OCR results to compare; adapt this for your case.
# Every file whose name starts with the DOI of a source file is a candidate,
# whatever its extension.
CANDIDATE_DIRS=(
  /path/to/dir1
  /path/to/dir2
  /path/to/dir3
  /path/to/dir4
)

# Succeeds (status 0) when number $1 is strictly smaller than number $2.
# awk is used so that decimal and exponent notation both compare numerically.
_score_less_than() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 < b + 0) }'
}

#######################################
# Cleans TARGET, then copies the best-scoring candidate of each source file
# into it.
# Globals:   FILE_SELECTOR, TARGET, CANDIDATE_DIRS (all read)
# Arguments: none
# Outputs:   progress report on stdout, warnings and errors on stderr
# Returns:   0 on success, 1 if TARGET is unusable or a copy failed
#######################################
main() {
  local file name doi dir candidate score best best_score
  local rc=0
  local -a stale
  local -r num_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'

  # Refuse to run the destructive clean-up below against an empty or missing
  # target (an empty TARGET would otherwise expand to "/*").
  if [[ -z ${TARGET:-} || ! -d $TARGET ]]; then
    printf 'error: target directory does not exist: %s\n' "${TARGET:-}" >&2
    return 1
  fi

  # Unmatched globs expand to nothing instead of to the literal pattern.
  shopt -s nullglob

  # Clean target
  stale=("$TARGET"/*)
  if ((${#stale[@]} > 0)); then
    rm -f -- "${stale[@]}"
  fi

  # Intentionally unquoted: FILE_SELECTOR holds a glob pattern.
  # shellcheck disable=SC2086
  for file in $FILE_SELECTOR; do
    # Derive the DOI prefix from the file name only, so that dots in the
    # directory part of the path cannot leak into it: drop everything up to
    # the first dot, put "10." back in front, then drop the extension.
    name=${file##*/}
    doi=10.${name#*.}
    doi=${doi%.*}

    # do the comparison
    # NOTE(review): the candidate with the LOWEST score wins, and only scores
    # strictly below the initial 1.0 are eligible. This is the behaviour of
    # the original script; confirm it matches the scale of ocrquality.rb.
    best_score=1.0
    best=""
    printf '%s\n' "--- $file ---"
    for dir in "${CANDIDATE_DIRS[@]}"; do
      for candidate in "$dir"/"$doi"*; do
        [[ -f $candidate ]] || continue
        score=$(./scorelines.sh "$candidate" | ./ocrquality.rb)
        printf '%s\n' " $candidate: $score"
        # A failed or garbled scoring run must not take part in the comparison.
        if [[ ! $score =~ $num_re ]]; then
          printf 'warning: no numeric score for %s, skipping\n' "$candidate" >&2
          continue
        fi
        if _score_less_than "$score" "$best_score"; then
          best=$candidate
          best_score=$score
        fi
      done
    done
    printf '%s\n' " => Best: $best ($best_score)"

    if [[ -z $best ]]; then
      printf 'warning: no usable candidate for %s, nothing copied\n' "$file" >&2
      continue
    fi
    if ! cp -- "$best" "$TARGET"/; then
      printf 'warning: could not copy %s to %s\n' "$best" "$TARGET" >&2
      rc=1
    fi
  done

  return "$rc"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment