Skip to content

Instantly share code, notes, and snippets.

@kisom
Created May 24, 2012 12:35
Show Gist options
  • Save kisom/2781329 to your computer and use it in GitHub Desktop.
Save kisom/2781329 to your computer and use it in GitHub Desktop.
wordcount
#!/usr/bin/env python
"""
Calculate a number of statistics about a document in English. Includes
the number of words, lexical diversity, and Flesch-Kincaid grade level.
Usage:
wordcount.py <textfile>
"""
import re
import sys
DEBUG = False
def build_wordlist(filename):
    """
    Read a text file and compute word statistics for it.

    Tokens rejected by valid_word() are skipped and the remaining ones
    are normalised with scrub_word().  Tokens that scrub down to an
    empty string (for example a bare "...") are dropped so they are not
    counted as words.

    Returns a dictionary with these keys:
        'num_words'    -- total number of words kept
        'unique_words' -- number of distinct words
        'avg_len'      -- mean length of the distinct words (0.0 if none)
        'wdict'        -- word frequency dictionary (word -> occurrences)
        'syllables'    -- word -> value returned by count_syllables()
        'sentences'    -- value returned by count_sentences() for the text
        'grade level'  -- value returned by grade_level() for this dict
    """
    # Use a context manager so the file handle is closed right away
    # instead of whenever the object happens to be collected.
    with open(filename) as infile:
        text = infile.read()

    # collapse newlines and runs of whitespace into single spaces
    text = re.sub(r'\s+', ' ', text)
    sentences = count_sentences(text)

    # split() with no argument never yields empty strings, whereas
    # split(' ') produces one for every leading/trailing space (and
    # nearly every file ends with a newline).
    scrubbed = [scrub_word(token) for token in text.split()
                if valid_word(token)]
    words = [word for word in scrubbed if word]

    wdict = {}      # word -> number of occurrences
    syllables = {}  # word -> syllable count, computed once per word
    for word in words:
        wdict[word] = wdict.get(word, 0) + 1
        if word not in syllables:
            syllables[word] = count_syllables(word)

    # The average is taken over the distinct words; guard against a
    # document that contains no words at all.
    wordlen = float(sum(len(word) for word in wdict))
    avg_len = wordlen / len(wdict) if wdict else 0.0

    wordlist = {
        'num_words': len(words),
        'unique_words': len(wdict),
        'avg_len': avg_len,
        'wdict': wdict,
        'syllables': syllables,
        'sentences': sentences
    }
    wordlist['grade level'] = grade_level(wordlist)
    return wordlist
def grade_level(wordlist):
    """
    Calculate the Flesch-Kincaid Grade Level.

    The formula is
        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

    wordlist is a dictionary with the keys 'num_words', 'sentences' and
    'syllables' (word -> syllables in that word).  When a 'wdict' key
    (word -> occurrences) is present, each word's syllables are weighted
    by how often the word occurs so that the total covers every word in
    the text, not just the distinct ones.

    Returns the grade as a float.  Returns 0.0 when there are no words;
    a sentence count of zero is treated as one sentence so that a text
    with words but no detected sentence break does not divide by zero.
    """
    num_words = wordlist['num_words']
    if not num_words:
        return 0.0
    sentences = max(wordlist['sentences'], 1)

    per_word = wordlist['syllables']
    counts = wordlist.get('wdict', {})
    total_syllables = sum(per_word[word] * counts.get(word, 1)
                          for word in per_word)

    # float() forces true division on Python 2, where int / int truncates.
    sent = 0.39 * (float(num_words) / sentences)
    syl = 11.8 * (float(total_syllables) / num_words)
    return sent + syl - 15.59
def valid_word(word):
    """
    Determine if a word should be included in the word list.

    A token is rejected when it starts with "http://" or "https://"
    (a URL) or with "[" (a bracketed reference).  Returns True for
    every other token.  When the module-level DEBUG flag is set, each
    rejected token is printed.
    """
    is_url = re.match(r'^https?://', word, re.U)
    is_reference = re.match(r'^\[', word, re.U)
    valid = not (is_url or is_reference)
    if DEBUG and not valid:
        # Parenthesised single-argument form prints the same thing on
        # Python 2 and Python 3.
        print("invalid word: %s" % (word, ))
    return valid
def scrub_word(word):
    """
    Normalise a token so its length reflects only the word itself.

    The token is lower-cased and stripped of surrounding whitespace,
    then the patterns below are removed in order.
    """
    cleaned = word.lower().strip()
    removals = (
        r'\[\d+\]',       # numeric references such as [12]
        r'[:;,.)!?]+$',   # trailing punctuation
        r'^[(]+',         # opening parentheses at the start
    )
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
def count_syllables(word):
    """
    Estimate the number of syllables in a word.

    Each run of the letters a, e, i, o, u, y counts as one syllable.
    A final "e" preceded by a non-vowel character is treated as silent
    and subtracts one, but never below one: short words such as "the"
    or "me" consist of nothing but that final "e".  A word with no
    vowel runs at all is counted as one syllable.

    Matching is case-sensitive; callers are expected to pass a
    lower-cased word.
    """
    vowel_runs = re.findall(r'[aeiouy]+', word, re.U)
    if not vowel_runs:
        return 1
    syllables = len(vowel_runs)
    if syllables > 1 and re.search(r'[^aeiouy]e$', word):
        syllables -= 1
    return syllables
def count_sentences(text):
    """
    Count the number of sentences in a bit of text.

    A sentence boundary is a ".", "?" or "!" followed by whitespace and
    then an upper-case ASCII letter or a digit.  N boundaries separate
    N + 1 sentences, so the final sentence (which has no following
    boundary) is included in the count.  Empty or whitespace-only text
    contains no sentences and yields 0.
    """
    if not text.strip():
        return 0
    boundaries = re.findall(r'[.?!]\s+[A-Z0-9]', text)
    return len(boundaries) + 1
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Tell the user what went wrong instead of exiting silently, and
        # use sys.exit rather than the interactive-only exit() helper.
        sys.stderr.write("usage: wordcount.py <textfile>\n")
        sys.exit(1)
    WORDLIST = build_wordlist(sys.argv[1])
    # Lexical diversity is unique words / total words; avoid dividing by
    # zero when the document contains no words.
    if WORDLIST['num_words']:
        DIVERSITY = WORDLIST['unique_words'] / (WORDLIST['num_words'] + 0.0)
    else:
        DIVERSITY = 0.0
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("number of words: %d" % (WORDLIST['num_words'], ))
    print("number of unique words: %d" % (WORDLIST['unique_words'], ))
    print("number of sentences: %d" % (WORDLIST['sentences'], ))
    print("average word length: %0.1f" % (WORDLIST['avg_len'], ))
    print("lexical diversity: %0.2f" % (DIVERSITY, ))
    print("grade level: %0.1f" % (WORDLIST['grade level'], ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment