Skip to content

Instantly share code, notes, and snippets.

@lukaspustina
Created January 15, 2014 13:03
Show Gist options
  • Select an option

  • Save lukaspustina/8435799 to your computer and use it in GitHub Desktop.

Select an option

Save lukaspustina/8435799 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
###############################################################################
# Lukas Pustina
# This script takes a dictionary (one word per line) as well as multiple text
# files and counts how many words of these text files are part of the
# dictionary.
#
# Primary use is to analyse the quality of Tesseract OCR output.
###############################################################################
# Example:
# ./check.py -d ngerman text.txt
###############################################################################
import sys
import re
def check(args):
    """Run the dictionary check for every file named in the arguments.

    Args:
        args: Command-line arguments without the program name; they are
            handed to parseArgs to obtain the dictionary path and the
            files to check.
    """
    dict_path, targets = parseArgs(args)
    known_words = read_in_dict_file(dict_path)
    for target in targets:
        check_file(known_words, target)
def parseArgs(args):
    """Split command-line arguments into a dictionary path and input files.

    ``-d`` consumes the argument that follows it as the dictionary file; if
    ``-d`` appears more than once, the last occurrence wins.  Every other
    argument is collected, in order, as a file to check.

    Args:
        args: Command-line arguments without the program name.

    Returns:
        A ``(dictionary_file, files)`` tuple.

    Exits:
        With status -1 (after printing a message) when no dictionary file
        was given, which includes a trailing ``-d`` with nothing after it;
        with status -2 when no files to check were given.
    """
    dictionary_file = None
    files = []
    iterator = iter(args)
    for a in iterator:
        if a == "-d":
            # next() with a default works on Python 2.6+ and 3, and turns a
            # trailing "-d" into the regular "not specified" error instead
            # of an uncaught StopIteration.
            dictionary_file = next(iterator, None)
        else:
            files.append(a)
    if dictionary_file is None:
        print("Dictionary_file not specified.")
        sys.exit(-1)
    if not files:
        print("No files to check specified.")
        sys.exit(-2)
    return (dictionary_file, files)
def read_in_dict_file(dictionary_file):
    """Load the dictionary word list from a file with one word per line.

    Args:
        dictionary_file: Path of the dictionary file.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of surrounding whitespace and lower-cased.  Blank lines
        yield empty-string entries.
    """
    # The context manager closes the handle as soon as reading finishes (or
    # fails) instead of leaving it open until garbage collection.
    with open(dictionary_file) as fh:
        return [line.strip().lower() for line in fh]
def check_file(dictionary, f):
    """Count the words of a text file found in the dictionary and print it.

    The text is split on runs of non-word characters.  Tokens shorter than
    two characters (single letters and the empty strings the split leaves at
    the edges) are ignored; every other token is lower-cased and looked up.
    Each occurrence counts, so repeated words are counted repeatedly.

    Args:
        dictionary: Iterable of known words.  Tokens are lower-cased before
            the lookup, so the entries are expected to be lower-case.
        f: Path of the text file to check.

    Prints:
        One line of the form ``<f>: <count>``.
    """
    # Close the handle deterministically rather than leaking it.
    with open(f) as fh:
        text = fh.read()
    # Build the lookup set once: membership tests against a list would scan
    # the whole dictionary for every single word of the text.
    known_words = frozenset(dictionary)
    word_counter = 0
    for word in re.split(r"\W+", text):
        # Compare by value ("is 1" tested object identity) and also drop the
        # empty split artefacts so a blank dictionary entry cannot match them.
        if len(word) < 2:
            continue
        if word.lower() in known_words:
            word_counter += 1
    print("%s: %d" % (f, word_counter))
if __name__ == '__main__':
    # Everything after the program name is handed to check(); with no
    # arguments at all, print the usage line and exit with status -3.
    cli_args = sys.argv[1:]
    if not cli_args:
        print(" -d dictionary_file <file to check> [file to check|...]")
        sys.exit(-3)
    check(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment