cboulanger · May 19, 2021 17:26
diff --git a/create-corpus-from-best-ocr-result.sh b/create-corpus-from-best-ocr-result.sh
 #! /usr/bin/env bash
 #
 # Builds a corpus by copying, for every source PDF, the best OCR candidate
 # into TARGET. "Best" is the candidate with the lowest score below 1.0, as
 # printed by the `./scorelines.sh <file> | ./ocrquality.rb` pipeline.
 #
 # see https://ryanfb.github.io/etc/2015/03/16/automatic_evaluation_of_ocr_quality.html
 # using https://github.com/saffsd/langid.py
 # install with pip install langid and add the scorelines.sh & ocrquality.rb scripts from the blog entry in the same directory
 #
 # Run the script from that directory: the helpers are invoked as
 # ./scorelines.sh and ./ocrquality.rb. bc is required for comparing scores.

 set -euo pipefail

 # The PDF source files, which start with a DOI, adapt this for your case
 # (a glob pattern; it is expanded when the script runs)
 FILE_SELECTOR='/path/to/source/dir/*.pdf'
 # The path to the directory to which the selected documents should be copied
 TARGET=/path/to/target/dir
 # The directories searched for comparison candidates, adapt this for your case.
 # Here, we have files in different directories with various file extensions.
 CANDIDATE_DIRS=(/path/to/dir1 /path/to/dir2 /path/to/dir3 /path/to/dir4)

 # Print a warning to stderr.
 warn() { printf 'warning: %s\n' "$*" >&2; }

 # Print an error to stderr and abort the script.
 die() { printf 'error: %s\n' "$*" >&2; exit 1; }

 #######################################
 # Derive the DOI from the name of a source file.
 # The directory part is dropped first so that dots in directory names cannot
 # confuse the parsing. Then everything up to the first dot of the file name
 # is replaced by "10" and the file extension is removed.
 # Arguments: $1 - path of a source file
 # Outputs:   the DOI on stdout
 #######################################
 doi_from_path() {
   local name=${1##*/}
   local doi=10.${name#*.}
   printf '%s\n' "${doi%.*}"
 }

 #######################################
 # Check that a string is a plain decimal number that bc can parse.
 # Arguments: $1 - string to check
 # Returns:   0 if $1 is a decimal number, 1 otherwise
 #######################################
 is_number() {
   local re='^-?([0-9]+\.?[0-9]*|\.[0-9]+)$'
   [[ $1 =~ $re ]]
 }

 #######################################
 # Select and copy the best candidate for every source file.
 # Globals: FILE_SELECTOR, TARGET, CANDIDATE_DIRS (read)
 # Outputs: progress on stdout, warnings and errors on stderr
 # Returns: 0 on success; exits non-zero on fatal errors
 #######################################
 main() {
   local -a sources=() candidates=() stale=()
   local file doi dir candidate score best best_score old_ifs

   # Unmatched globs expand to nothing instead of the literal pattern.
   shopt -s nullglob

   # Expand the pattern with an empty IFS: it is globbed, but neither the
   # pattern nor the matches are split on whitespace.
   old_ifs=$IFS
   IFS=
   # shellcheck disable=SC2206
   sources=($FILE_SELECTOR)
   IFS=$old_ifs

   # Bail out before touching TARGET, so a mistyped FILE_SELECTOR does not
   # wipe an existing corpus.
   if (( ${#sources[@]} == 0 )); then
     warn "no files match $FILE_SELECTOR, nothing to do"
     return 0
   fi

   command -v bc > /dev/null || die "bc is required to compare scores"
   mkdir -p -- "$TARGET" || die "cannot create target directory $TARGET"

   # Clean target; ${TARGET:?} aborts instead of expanding to /* if TARGET is empty
   stale=("${TARGET:?}"/*)
   if (( ${#stale[@]} > 0 )); then
     rm -f -- "${stale[@]}" || warn "could not remove everything in $TARGET"
   fi

   for file in "${sources[@]}"; do
     # select the comparison candidates: every regular file in CANDIDATE_DIRS
     # whose name starts with the DOI
     doi=$(doi_from_path "$file")
     candidates=()
     for dir in "${CANDIDATE_DIRS[@]}"; do
       for candidate in "$dir/$doi"*; do
         if [[ -f $candidate ]]; then
           candidates+=("$candidate")
         fi
       done
     done

     printf '%s\n' "--- $file ---"
     if (( ${#candidates[@]} == 0 )); then
       warn "no candidates found for $doi, skipping"
       continue
     fi

     # do the comparison: the lowest score below 1.0 wins
     best_score=1.0
     best=""
     for candidate in "${candidates[@]}"; do
       if ! score=$(./scorelines.sh "$candidate" | ./ocrquality.rb); then
         warn "scoring failed for $candidate"
         continue
       fi
       printf '%s\n' "    $candidate: $score"
       if ! is_number "$score"; then
         warn "ignoring non-numeric score for $candidate"
         continue
       fi
       # bc prints 1 when the relation holds and 0 otherwise
       if (( $(bc -l <<< "$score < $best_score") )); then
         best=$candidate
         best_score=$score
       fi
     done

     if [[ -z $best ]]; then
       warn "no candidate scored below 1.0 for $doi, skipping"
       continue
     fi
     printf '%s\n' "    => Best: $best ($best_score)"
     cp -- "$best" "$TARGET"/ || die "cannot copy $best to $TARGET"
   done
 }

 main "$@"
	#! /usr/bin/env bash
	#
	# Builds a corpus by copying, for every source PDF, the best OCR candidate
	# into TARGET. "Best" is the candidate with the lowest score below 1.0, as
	# printed by the `./scorelines.sh <file> | ./ocrquality.rb` pipeline.
	#
	# see https://ryanfb.github.io/etc/2015/03/16/automatic_evaluation_of_ocr_quality.html
	# using https://github.com/saffsd/langid.py
	# install with pip install langid and add the scorelines.sh & ocrquality.rb scripts from the blog entry in the same directory
	#
	# Run the script from that directory: the helpers are invoked as
	# ./scorelines.sh and ./ocrquality.rb. bc is required for comparing scores.

	set -euo pipefail

	# The PDF source files, which start with a DOI, adapt this for your case
	# (a glob pattern; it is expanded when the script runs)
	FILE_SELECTOR='/path/to/source/dir/*.pdf'
	# The path to the directory to which the selected documents should be copied
	TARGET=/path/to/target/dir
	# The directories searched for comparison candidates, adapt this for your case.
	# Here, we have files in different directories with various file extensions.
	CANDIDATE_DIRS=(/path/to/dir1 /path/to/dir2 /path/to/dir3 /path/to/dir4)

	# Print a warning to stderr.
	warn() { printf 'warning: %s\n' "$*" >&2; }

	# Print an error to stderr and abort the script.
	die() { printf 'error: %s\n' "$*" >&2; exit 1; }

	#######################################
	# Derive the DOI from the name of a source file.
	# The directory part is dropped first so that dots in directory names cannot
	# confuse the parsing. Then everything up to the first dot of the file name
	# is replaced by "10" and the file extension is removed.
	# Arguments: $1 - path of a source file
	# Outputs:   the DOI on stdout
	#######################################
	doi_from_path() {
	  local name=${1##*/}
	  local doi=10.${name#*.}
	  printf '%s\n' "${doi%.*}"
	}

	#######################################
	# Check that a string is a plain decimal number that bc can parse.
	# Arguments: $1 - string to check
	# Returns:   0 if $1 is a decimal number, 1 otherwise
	#######################################
	is_number() {
	  local re='^-?([0-9]+\.?[0-9]*|\.[0-9]+)$'
	  [[ $1 =~ $re ]]
	}

	#######################################
	# Select and copy the best candidate for every source file.
	# Globals: FILE_SELECTOR, TARGET, CANDIDATE_DIRS (read)
	# Outputs: progress on stdout, warnings and errors on stderr
	# Returns: 0 on success; exits non-zero on fatal errors
	#######################################
	main() {
	  local -a sources=() candidates=() stale=()
	  local file doi dir candidate score best best_score old_ifs

	  # Unmatched globs expand to nothing instead of the literal pattern.
	  shopt -s nullglob

	  # Expand the pattern with an empty IFS: it is globbed, but neither the
	  # pattern nor the matches are split on whitespace.
	  old_ifs=$IFS
	  IFS=
	  # shellcheck disable=SC2206
	  sources=($FILE_SELECTOR)
	  IFS=$old_ifs

	  # Bail out before touching TARGET, so a mistyped FILE_SELECTOR does not
	  # wipe an existing corpus.
	  if (( ${#sources[@]} == 0 )); then
	    warn "no files match $FILE_SELECTOR, nothing to do"
	    return 0
	  fi

	  command -v bc > /dev/null || die "bc is required to compare scores"
	  mkdir -p -- "$TARGET" || die "cannot create target directory $TARGET"

	  # Clean target; ${TARGET:?} aborts instead of expanding to /* if TARGET is empty
	  stale=("${TARGET:?}"/*)
	  if (( ${#stale[@]} > 0 )); then
	    rm -f -- "${stale[@]}" || warn "could not remove everything in $TARGET"
	  fi

	  for file in "${sources[@]}"; do
	    # select the comparison candidates: every regular file in CANDIDATE_DIRS
	    # whose name starts with the DOI
	    doi=$(doi_from_path "$file")
	    candidates=()
	    for dir in "${CANDIDATE_DIRS[@]}"; do
	      for candidate in "$dir/$doi"*; do
	        if [[ -f $candidate ]]; then
	          candidates+=("$candidate")
	        fi
	      done
	    done

	    printf '%s\n' "--- $file ---"
	    if (( ${#candidates[@]} == 0 )); then
	      warn "no candidates found for $doi, skipping"
	      continue
	    fi

	    # do the comparison: the lowest score below 1.0 wins
	    best_score=1.0
	    best=""
	    for candidate in "${candidates[@]}"; do
	      # a real pipe: the escaped \| handed a literal "|" to scorelines.sh
	      if ! score=$(./scorelines.sh "$candidate" | ./ocrquality.rb); then
	        warn "scoring failed for $candidate"
	        continue
	      fi
	      printf '%s\n' " $candidate: $score"
	      if ! is_number "$score"; then
	        warn "ignoring non-numeric score for $candidate"
	        continue
	      fi
	      # bc prints 1 when the relation holds and 0 otherwise
	      if (( $(bc -l <<< "$score < $best_score") )); then
	        best=$candidate
	        best_score=$score
	      fi
	    done

	    if [[ -z $best ]]; then
	      warn "no candidate scored below 1.0 for $doi, skipping"
	      continue
	    fi
	    printf '%s\n' " => Best: $best ($best_score)"
	    cp -- "$best" "$TARGET"/ || die "cannot copy $best to $TARGET"
	  done
	}

	main "$@"
No results found