Created
May 24, 2012 12:35
-
-
Save kisom/2781329 to your computer and use it in GitHub Desktop.
wordcount
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Calculate a number of statistics about a document in English. Includes | |
the number of words, lexical diversity, and Flesch-Kincaid grade level.
Usage: | |
wordcount.py <textfile> | |
""" | |
import re | |
import sys | |
DEBUG = False | |
def build_wordlist(filename):
    """
    Read a text file and compute word statistics for it.

    The text is whitespace-normalised and split into tokens. Tokens
    rejected by valid_word() are dropped, the rest are cleaned with
    scrub_word(). Tokens that are empty after scrubbing (for example a
    stray "..." or "(") are discarded so they do not inflate the counts.

    Returns a dictionary with the keys:
        'num_words'    -- total number of words kept
        'unique_words' -- number of distinct words
        'avg_len'      -- mean length of the distinct words (0.0 if none)
        'wdict'        -- word frequency dictionary (word -> count)
        'syllables'    -- syllable count for each distinct word
        'sentences'    -- value returned by count_sentences()
        'grade level'  -- value returned by grade_level() (0.0 if no words)
    """
    # the context manager closes the file even if read() raises
    with open(filename) as textfile:
        text = textfile.read()

    # collapse newlines and runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    sentences = count_sentences(text)

    words = []
    # split() without arguments never yields empty tokens
    for token in text.split():
        if not valid_word(token):
            continue
        word = scrub_word(token)
        if word:
            words.append(word)

    wdict = {}      # word -> number of occurrences
    syllables = {}  # word -> syllable count, computed once per word
    for word in words:
        wdict[word] = wdict.get(word, 0) + 1
        if word not in syllables:
            syllables[word] = count_syllables(word)

    # the average is taken over distinct words, not over every occurrence
    total_len = sum(len(word) for word in wdict)
    if wdict:
        avg_len = float(total_len) / len(wdict)
    else:
        avg_len = 0.0

    wordlist = {
        'num_words': len(words),
        'unique_words': len(wdict),
        'avg_len': avg_len,
        'wdict': wdict,
        'syllables': syllables,
        'sentences': sentences
    }
    # a document without words has no meaningful grade level
    if words:
        wordlist['grade level'] = grade_level(wordlist)
    else:
        wordlist['grade level'] = 0.0
    return wordlist
def grade_level(wordlist):
    """
    Calculate the Flesch-Kincaid Grade Level.

    The formula is
        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    where every quantity is a total over the whole document.

    wordlist is a dictionary with the keys 'num_words', 'sentences' and
    'syllables' (word -> syllable count). If it also has 'wdict'
    (word -> occurrence count), each word's syllables are weighted by how
    often the word occurs; words missing from 'wdict' count once.

    Returns the grade as a float. A document with no words yields 0.0,
    and a sentence count of zero is treated as one sentence.
    """
    num_words = wordlist['num_words']
    if not num_words:
        return 0.0

    # text with words always holds at least one sentence
    sentences = max(wordlist['sentences'], 1)

    per_word = wordlist['syllables']
    counts = wordlist.get('wdict', {})
    # total syllables in the document, not just over distinct words
    total_syllables = sum(per_word[word] * counts.get(word, 1)
                          for word in per_word)

    # float() forces true division on Python 2 as well
    sent = 0.39 * (float(num_words) / sentences)
    syl = 11.8 * (float(total_syllables) / num_words)
    return sent + syl - 15.59
def valid_word(word):
    """
    Decide whether a raw token belongs in the word list.

    Tokens that start with an http:// or https:// URL scheme, or with an
    opening square bracket, are rejected. When DEBUG is set, each
    rejected token is printed.
    """
    is_url = re.match(r'^https?://', word, re.U)
    is_bracketed = re.match(r'^\[', word, re.U)
    valid = not (is_url or is_bracketed)
    if DEBUG and not valid:
        print("invalid word: %s" % (word, ))
    return valid
def scrub_word(word):
    """Normalise a token so its length reflects only the word itself."""
    cleaned = word.lower().strip()
    removals = (
        r'\[\d+\]',       # numeric references such as [12]
        r'[:;,.)!?]+$',   # trailing punctuation
        r'^[(]+',         # leading opening parentheses
    )
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
def count_syllables(word):
    """
    Estimate the number of syllables in a lowercase word.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one
    syllable. A final 'e' preceded by a non-vowel is treated as silent
    and subtracts one, but never below one, so short words such as
    "the" or "be" still count as a single syllable. A word with no
    vowels at all also counts as one syllable.
    """
    groups = re.findall(r'[aeiouy]+', word, re.U)
    if not groups:
        return 1
    syllables = len(groups)
    # only drop the silent 'e' when another vowel group remains
    if syllables > 1 and re.search(r'[^aeiouy]e$', word):
        syllables -= 1
    return syllables
def count_sentences(text):
    """
    Count the number of sentences in a bit of text.

    A sentence boundary is a '.', '?' or '!' followed by whitespace and
    then an uppercase letter or a digit. The boundaries separate the
    sentences, so non-blank text contains one more sentence than it has
    boundaries. Empty or whitespace-only text contains no sentences.
    """
    if not text.strip():
        return 0
    boundaries = re.findall(r'[.?!]\s+[A-Z0-9]', text)
    # the final sentence has no boundary after it, so add it back
    return len(boundaries) + 1
if __name__ == '__main__':
    # sys.exit() with a string writes it to stderr and exits with status 1;
    # the bare exit() helper is only meant for interactive sessions
    if len(sys.argv) < 2:
        sys.exit("usage: wordcount.py <textfile>")
    WORDLIST = build_wordlist(sys.argv[1])
    NUM_WORDS = WORDLIST['num_words']
    # lexical diversity is unique words / total words; report 0.0 for a
    # document without words instead of dividing by zero
    if NUM_WORDS:
        DIVERSITY = WORDLIST['unique_words'] / (NUM_WORDS + 0.0)
    else:
        DIVERSITY = 0.0
    print("number of words: %d" % (NUM_WORDS, ))
    print("number of unique words: %d" % (WORDLIST['unique_words'], ))
    print("number of sentences: %d" % (WORDLIST['sentences'], ))
    print("average word length: %0.1f" % (WORDLIST['avg_len'], ))
    print("lexical diversity: %0.2f" % (DIVERSITY, ))
    print("grade level: %0.1f" % (WORDLIST['grade level'], ))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment