Created
January 15, 2014 13:03
-
-
Save lukaspustina/8435799 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| ############################################################################### | |
| # Lukas Pustina | |
| # This script takes a dictionary (one word per line) as well as multiple text | |
| # files and counts how many words of these text files are part of the | |
| # dictionary. | |
| # | |
| # Primary use is to analyse the quality of Tesseract OCR output. | |
| ############################################################################### | |
| # Example: | |
| # ./check.py -d ngerman text.txt | |
| ############################################################################### | |
| import sys | |
| import re | |
def check(args):
    """Run the dictionary check for every text file named in *args*.

    The arguments are split into the dictionary path and the list of text
    files, the dictionary is loaded once, and each text file is then
    checked against it in the order given.

    Args:
        args: Command-line arguments without the program name.
    """
    dict_path, text_paths = parseArgs(args)
    known_words = read_in_dict_file(dict_path)
    for text_path in text_paths:
        check_file(known_words, text_path)
def parseArgs(args):
    """Split command-line arguments into the dictionary path and the files to check.

    The argument directly following a ``-d`` flag is taken as the dictionary
    file (if ``-d`` is given several times, the last one wins); every other
    argument is collected as a file to check.

    Args:
        args: Command-line arguments without the program name.

    Returns:
        A ``(dictionary_file, files)`` tuple, where ``files`` is a list.

    Exits:
        With status -1 if no dictionary file was given (including a trailing
        ``-d`` without a value), and with status -2 if no files were given.
    """
    dictionary_file = None
    files = []
    iterator = iter(args)
    for a in iterator:
        if a == "-d":
            # The next() builtin works on Python 2.6+ and 3 (the .next()
            # method is Python 2 only). The default turns a dangling "-d"
            # into the regular "not specified" error below instead of an
            # uncaught StopIteration.
            dictionary_file = next(iterator, None)
        else:
            files.append(a)
    if dictionary_file is None:
        print("Dictionary_file not specified.")
        sys.exit(-1)
    if not files:
        print("No files to check specified.")
        sys.exit(-2)
    return (dictionary_file, files)
def read_in_dict_file(dictionary_file):
    """Load the dictionary file into a list of normalised words.

    Each line of the file becomes one entry, stripped of surrounding
    whitespace and converted to lower case. Blank lines are kept as empty
    strings, so the list has exactly one entry per line.

    Args:
        dictionary_file: Path of the dictionary file (one word per line).

    Returns:
        A list of lower-cased words in file order.
    """
    # The context manager closes the file even if reading fails; iterating
    # the handle directly avoids building an intermediate list of raw lines.
    with open(dictionary_file) as fh:
        return [line.strip().lower() for line in fh]
def check_file(dictionary, f):
    """Count the words of text file *f* found in *dictionary* and print the result.

    The text is split on runs of non-word characters. Tokens shorter than
    two characters are ignored; the remaining tokens are lower-cased and
    looked up in the dictionary. The result is printed as ``<f>: <count>``.

    Args:
        dictionary: Iterable of lower-cased dictionary words.
        f: Path of the text file to check.
    """
    # Build the lookup set once so every membership test is a hash lookup
    # instead of a linear scan over the whole dictionary.
    known_words = set(dictionary)
    with open(f) as fh:
        text = fh.read()
    word_counter = 0
    for word in re.split(r"\W+", text):
        # Skip single characters, and also the empty strings re.split yields
        # when the text starts or ends with a separator; an empty token is
        # not a word and must not match a blank dictionary entry.
        if len(word) <= 1:
            continue
        if word.lower() in known_words:
            word_counter += 1
    print("%s: %d" % (f, word_counter))
# Script entry point: print a usage hint when called without arguments,
# otherwise hand everything after the program name to check().
if __name__ == '__main__':
    if len(sys.argv) <= 1:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(" -d dictionary_file <file to check> [file to check|...]")
        sys.exit(-3)
    check(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment