kisom · May 24, 2012 12:35
diff --git a/wordcount.py b/wordcount.py

 #!/usr/bin/env python
 """
 Calculate a number of statistics about a document in English. Includes
 the number of words, lexical diversity, and Flesch-Kincaid grade level.

 Usage:
    wordcount.py <textfile>
 """

 import re
 import sys


 DEBUG = False


 def build_wordlist(filename):
     """
     Read a text file and compute word statistics for it.

     The text is whitespace-normalised, split on single spaces, filtered
     with valid_word() and cleaned with scrub_word().  Tokens that are
     empty after cleaning (a bare "..." for example, or the empty strings
     produced by leading/trailing spaces) are not counted as words.

     Returns a dictionary with the keys:
         'num_words'    -- total number of words
         'unique_words' -- number of distinct words
         'avg_len'      -- mean length of the distinct words
         'wdict'        -- word -> frequency dictionary
         'syllables'    -- word -> syllable count, from count_syllables()
         'sentences'    -- result of count_sentences() on the text
         'grade level'  -- result of grade_level() on the values above
     """
     # a context manager closes the file handle as soon as it is read
     with open(filename) as handle:
         text = handle.read()

     # collapse newlines and runs of whitespace into single spaces
     text = re.sub(r'\s+', ' ', text)

     sentences = count_sentences(text)

     scrubbed = [scrub_word(token) for token in text.split(' ')
                 if valid_word(token)]
     # scrubbing can leave nothing behind; such tokens are not words
     words = [word for word in scrubbed if word]

     wdict = {}                      # word -> number of occurrences
     syllables = {}                  # word -> syllable count
     for word in words:
         wdict[word] = wdict.get(word, 0) + 1
         if word not in syllables:
             syllables[word] = count_syllables(word)

     # float total keeps the average a true division on Python 2
     wordlen = float(sum(len(word) for word in wdict))
     # this computation must not divide by zero when no words were found
     avg_len = wordlen / len(wdict) if wdict else 0.0

     wordlist = {
         'num_words': len(words),
         'unique_words': len(wdict),
         'avg_len': avg_len,
         'wdict': wdict,
         'syllables': syllables,
         'sentences': sentences
     }

     wordlist['grade level'] = grade_level(wordlist)
     return wordlist


 def grade_level(wordlist):
     """
     Calculate the Flesch-Kincaid Grade Level.

         0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

     wordlist is a dictionary; the keys read are 'num_words', 'sentences',
     'syllables' (word -> syllable count) and, when present, 'wdict'
     (word -> frequency).  Per-word syllable counts are multiplied by the
     word's frequency so the syllable total covers every word in the
     text; a word missing from 'wdict' is counted once.

     Returns the grade as a float, or 0.0 when there are no words.
     """
     num_words = wordlist['num_words']
     if not num_words:
         return 0.0

     # a text that has words holds at least one sentence, even when the
     # sentence counter found none
     sentences = max(wordlist['sentences'], 1)

     frequencies = wordlist.get('wdict', {})
     per_word = wordlist['syllables']
     total_syllables = sum(per_word[word] * frequencies.get(word, 1)
                           for word in per_word)

     # float() forces true division on Python 2
     sent = 0.39 * (float(num_words) / sentences)
     syl = 11.8 * (float(total_syllables) / num_words)

     return sent + syl - 15.59


 def valid_word(word):
     """
     Decide whether a raw token belongs in the word list.

     Tokens that start with an http:// or https:// scheme, or with an
     opening square bracket, are rejected.  When DEBUG is set, each
     rejected token is printed.  Returns True or False.
     """
     rejected = False
     for prefix in (r'^https?://', r'^\['):
         if re.match(prefix, word, re.U):
             rejected = True
             break

     if DEBUG and rejected:
         print("invalid word: %s" % (word, ))
     return not rejected


 def scrub_word(word):
     """
     Normalise a token so its length reflects only the word itself.

     The token is lower-cased and stripped of surrounding whitespace,
     then three clean-ups run in order: bracketed numeric references
     such as [12] are removed, trailing punctuation is trimmed, and
     leading opening parentheses are trimmed.
     """
     cleaned = word.lower().strip()

     removals = (
         r'\[\d+\]',        # references
         r'[:;,.)!?]+$',    # trailing punctuation
         r'^[(]+',          # starting parens
     )
     for pattern in removals:
         cleaned = re.sub(pattern, '', cleaned)

     return cleaned


 def count_syllables(word):
     """
     Estimate the number of syllables in a (lower-case) word.

     Each run of the vowels a, e, i, o, u, y counts as one syllable, and
     one is subtracted when the word ends in a consonant followed by "e"
     (a silent e, as in "make").  The result is never less than 1, so
     short words such as "the" or "be" are not reduced to zero, and a
     word with no vowels at all also counts as 1.
     """
     groups = re.findall(r'[aeiouy]+', word, re.U)
     if not groups:
         return 1

     syllables = len(groups)
     if re.search(r'[^aeiouy]e$', word):
         syllables -= 1

     # the silent-e rule must not wipe out a word's only syllable
     return max(syllables, 1)


 def count_sentences(text):
     """
     Count the number of sentences in a bit of text.

     A sentence boundary is a ".", "?" or "!" followed by whitespace and
     then an upper-case letter or a digit.  N boundaries separate N + 1
     sentences, so the final (or only) sentence is included in the count.
     Text that is empty or only whitespace contains no sentences.
     """
     if not text.strip():
         return 0

     boundaries = re.findall(r'[.?!]\s+[A-Z0-9]', text)
     return len(boundaries) + 1


 if __name__ == '__main__':
     # the text file to analyse is the only command-line argument
     if len(sys.argv) < 2:
         sys.stderr.write("usage: wordcount.py <textfile>\n")
         # sys.exit is always available; the bare exit() helper is only
         # installed by the site module
         sys.exit(1)

     WORDLIST = build_wordlist(sys.argv[1])

     # float() forces true division on Python 2; a text with no words
     # has no diversity to report
     if WORDLIST['num_words']:
         DIVERSITY = WORDLIST['unique_words'] / float(WORDLIST['num_words'])
     else:
         DIVERSITY = 0.0

     print("number of words: %d" % (WORDLIST['num_words'], ))
     print("number of unique words: %d" % (WORDLIST['unique_words'], ))
     print("number of sentences: %d" % (WORDLIST['sentences'], ))
     print("average word length: %0.1f" % (WORDLIST['avg_len'], ))
     print("lexical diversity: %0.2f" % (DIVERSITY, ))
     print("grade level: %0.1f" % (WORDLIST['grade level'], ))

	#!/usr/bin/env python
	"""
	Calculate a number of statistics about a document in English. Includes
	the number of words, lexical diversity, and Flesch-Kincaid grade level.

	Usage:
	wordcount.py <textfile>
	"""

	import re
	import sys


	DEBUG = False


	def build_wordlist(filename):
	    """
	    Read a text file and compute word statistics for it.

	    The text is whitespace-normalised, split on single spaces, filtered
	    with valid_word() and cleaned with scrub_word().  Tokens that are
	    empty after cleaning (a bare "..." for example, or the empty strings
	    produced by leading/trailing spaces) are not counted as words.

	    Returns a dictionary with the keys:
	        'num_words'    -- total number of words
	        'unique_words' -- number of distinct words
	        'avg_len'      -- mean length of the distinct words
	        'wdict'        -- word -> frequency dictionary
	        'syllables'    -- word -> syllable count, from count_syllables()
	        'sentences'    -- result of count_sentences() on the text
	        'grade level'  -- result of grade_level() on the values above
	    """
	    # a context manager closes the file handle as soon as it is read
	    with open(filename) as handle:
	        text = handle.read()

	    # collapse newlines and runs of whitespace into single spaces
	    text = re.sub(r'\s+', ' ', text)

	    sentences = count_sentences(text)

	    scrubbed = [scrub_word(token) for token in text.split(' ')
	                if valid_word(token)]
	    # scrubbing can leave nothing behind; such tokens are not words
	    words = [word for word in scrubbed if word]

	    wdict = {}                      # word -> number of occurrences
	    syllables = {}                  # word -> syllable count
	    for word in words:
	        wdict[word] = wdict.get(word, 0) + 1
	        if word not in syllables:
	            syllables[word] = count_syllables(word)

	    # float total keeps the average a true division on Python 2
	    wordlen = float(sum(len(word) for word in wdict))
	    # this computation must not divide by zero when no words were found
	    avg_len = wordlen / len(wdict) if wdict else 0.0

	    wordlist = {
	        'num_words': len(words),
	        'unique_words': len(wdict),
	        'avg_len': avg_len,
	        'wdict': wdict,
	        'syllables': syllables,
	        'sentences': sentences
	    }

	    wordlist['grade level'] = grade_level(wordlist)
	    return wordlist


	def grade_level(wordlist):
	    """
	    Calculate the Flesch-Kincaid Grade Level.

	        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

	    wordlist is a dictionary; the keys read are 'num_words', 'sentences',
	    'syllables' (word -> syllable count) and, when present, 'wdict'
	    (word -> frequency).  Per-word syllable counts are multiplied by the
	    word's frequency so the syllable total covers every word in the
	    text; a word missing from 'wdict' is counted once.

	    Returns the grade as a float, or 0.0 when there are no words.
	    """
	    num_words = wordlist['num_words']
	    if not num_words:
	        return 0.0

	    # a text that has words holds at least one sentence, even when the
	    # sentence counter found none
	    sentences = max(wordlist['sentences'], 1)

	    frequencies = wordlist.get('wdict', {})
	    per_word = wordlist['syllables']
	    total_syllables = sum(per_word[word] * frequencies.get(word, 1)
	                          for word in per_word)

	    # float() forces true division on Python 2
	    sent = 0.39 * (float(num_words) / sentences)
	    syl = 11.8 * (float(total_syllables) / num_words)

	    return sent + syl - 15.59


	def valid_word(word):
	    """
	    Decide whether a raw token belongs in the word list.

	    Tokens that start with an http:// or https:// scheme, or with an
	    opening square bracket, are rejected.  When DEBUG is set, each
	    rejected token is printed.  Returns True or False.
	    """
	    rejected = False
	    for prefix in (r'^https?://', r'^\['):
	        if re.match(prefix, word, re.U):
	            rejected = True
	            break

	    if DEBUG and rejected:
	        print("invalid word: %s" % (word, ))
	    return not rejected


	def scrub_word(word):
	    """
	    Normalise a token so its length reflects only the word itself.

	    The token is lower-cased and stripped of surrounding whitespace,
	    then three clean-ups run in order: bracketed numeric references
	    such as [12] are removed, trailing punctuation is trimmed, and
	    leading opening parentheses are trimmed.
	    """
	    cleaned = word.lower().strip()

	    removals = (
	        r'\[\d+\]',        # references
	        r'[:;,.)!?]+$',    # trailing punctuation
	        r'^[(]+',          # starting parens
	    )
	    for pattern in removals:
	        cleaned = re.sub(pattern, '', cleaned)

	    return cleaned


	def count_syllables(word):
	    """
	    Estimate the number of syllables in a (lower-case) word.

	    Each run of the vowels a, e, i, o, u, y counts as one syllable, and
	    one is subtracted when the word ends in a consonant followed by "e"
	    (a silent e, as in "make").  The result is never less than 1, so
	    short words such as "the" or "be" are not reduced to zero, and a
	    word with no vowels at all also counts as 1.
	    """
	    groups = re.findall(r'[aeiouy]+', word, re.U)
	    if not groups:
	        return 1

	    syllables = len(groups)
	    if re.search(r'[^aeiouy]e$', word):
	        syllables -= 1

	    # the silent-e rule must not wipe out a word's only syllable
	    return max(syllables, 1)


	def count_sentences(text):
	    """
	    Count the number of sentences in a bit of text.

	    A sentence boundary is a ".", "?" or "!" followed by whitespace and
	    then an upper-case letter or a digit.  N boundaries separate N + 1
	    sentences, so the final (or only) sentence is included in the count.
	    Text that is empty or only whitespace contains no sentences.
	    """
	    if not text.strip():
	        return 0

	    boundaries = re.findall(r'[.?!]\s+[A-Z0-9]', text)
	    return len(boundaries) + 1


	if __name__ == '__main__':
	    # the text file to analyse is the only command-line argument
	    if len(sys.argv) < 2:
	        sys.stderr.write("usage: wordcount.py <textfile>\n")
	        # sys.exit is always available; the bare exit() helper is only
	        # installed by the site module
	        sys.exit(1)

	    WORDLIST = build_wordlist(sys.argv[1])

	    # float() forces true division on Python 2; a text with no words
	    # has no diversity to report
	    if WORDLIST['num_words']:
	        DIVERSITY = WORDLIST['unique_words'] / float(WORDLIST['num_words'])
	    else:
	        DIVERSITY = 0.0

	    print("number of words: %d" % (WORDLIST['num_words'], ))
	    print("number of unique words: %d" % (WORDLIST['unique_words'], ))
	    print("number of sentences: %d" % (WORDLIST['sentences'], ))
	    print("average word length: %0.1f" % (WORDLIST['avg_len'], ))
	    print("lexical diversity: %0.2f" % (DIVERSITY, ))
	    print("grade level: %0.1f" % (WORDLIST['grade level'], ))