Created
April 24, 2012 20:36
-
-
Save drinks/2483508 to your computer and use it in GitHub Desktop.
Flesch-Kincaid grade level processing against Capitol Words API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import division | |
from curses.ascii import isdigit | |
import json | |
import sys | |
import datetime | |
# import re | |
from django.utils.datastructures import SortedDict | |
from django.contrib.localflavor.us.us_states import STATE_CHOICES | |
from nltk import sent_tokenize, regexp_tokenize | |
from nltk.corpus import cmudict | |
from sunlight import capitolwords as cw | |
from sunlight import congress | |
# Syllable count assumed for any word missing from the pronouncing dictionary.
SYLLABLE_AVG = 1.66
# Inclusive bounds of the Congress numbers offered by the 'congress' facet.
STARTING_CONGRESS = 104
CURRENT_CONGRESS = 112
# Page size requested from the Capitol Words text endpoint.
PER_PAGE = 1000
# CMU pronouncing dictionary, consulted for per-word syllable counts.
DICT = cmudict.dict()

# Legislator records indexed by bioguide id; records whose id is empty are
# left out.
LEGISLATORS = dict(
    (member['bioguide_id'], member)
    for member in congress.legislators()
    if member['bioguide_id']
)

# First element of every STATE_CHOICES entry.
STATES = [choice[0] for choice in STATE_CHOICES]
# Adapted From Natural Language Processing with Python.
#
# Every group is non-capturing on purpose: re.findall() returns tuples of
# groups (instead of the matched text) as soon as a pattern contains a
# capturing group, and NLTK releases that hand the pattern to findall()
# unchanged would then yield tuples rather than token strings.
#
# NOTE(review): in the final character class ``:-_`` is a *range* from ':' to
# '_' (so it also covers '<', '=', '>', '@', '\', '^' and excludes a literal
# '-'), not three separate characters.  It is kept exactly as written in the
# book; single-character tokens are discarded by the caller anyway.
TOKEN_REGEX = r'''(?xi)
    (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+  # Bills
  | (?:[A-Z]\.)+                          # Abbreviations (U.S.A., etc.)
  | (?:[A-Z]+\&[A-Z]+)                    # Internal ampersands (AT&T, etc.)
  | (?:Mr\.|Dr\.|Mrs\.|Ms\.)              # Mr., Mrs., etc.
  | \d*\.\d+                              # Numbers with decimal points.
  | \d\d?:\d\d                            # Times.
  | \$?[,\.0-9]+\d                        # Numbers with thousands separators, (incl currency).
  | (?:(?:(?:a|A)|(?:p|P))\.(?:m|M)\.)    # a.m., p.m., A.M., P.M.
  | \w+(?:(?:-|')\w+)*                    # Words with optional internal hyphens.
  | \$?\d+(?:\.\d+)?%?                    # Currency and percentages.
  | (?<=\b)\.\.\.(?=\b)                   # Ellipses surrounded by word borders
  | [][.,;"'?():-_`]
'''


def tokenize(term):
    """Split *term* into a list of token strings.

    Bill numbers, dotted abbreviations, honorifics, numbers, times, currency
    amounts and hyphenated/apostrophised words are each kept as one token.

    Punctuation tokens are NOT stripped here; the caller filters them out by
    token length instead.

    :param term: text to tokenize.
    :returns: list of tokens in the order they occur in *term*.
    """
    return regexp_tokenize(term, TOKEN_REGEX)
def nsyl(word, pronunciations=None):
    """Return the syllable count of *word* from a pronouncing dictionary.

    The count is the number of phonemes in the word's FIRST listed
    pronunciation whose last character is a digit.

    :param word: word to look up; it is lower-cased before the lookup.
    :param pronunciations: mapping of lower-case word -> list of
        pronunciations, each a sequence of phoneme strings.  Defaults to the
        module-level ``DICT``.
    :returns: number of syllables as an int.
    :raises KeyError: if the word has no entry (callers rely on this to
        detect dictionary misses).
    """
    if pronunciations is None:
        pronunciations = DICT
    # Only the first pronunciation is ever used, so don't count the others.
    first_pronunciation = pronunciations[word.lower()][0]
    return sum(1 for phoneme in first_pronunciation if isdigit(phoneme[-1]))
def month_ranges(start_date, end_date):
    """Return ``(first_day, last_day)`` date strings for each month in a span.

    Only the year and month of the ``YYYY-MM-DD`` arguments are used, and the
    months of both arguments are included.  Dates are zero-padded so they have
    the same shape as the ones produced by :func:`year_ranges`.

    :returns: list of 2-tuples of strings; empty when start is after end.
    """
    import calendar

    year, month = [int(part) for part in start_date.split('-')[:2]]
    last = tuple(int(part) for part in end_date.split('-')[:2])
    ranges = []
    # Compare (year, month) as a pair: testing the year and the month
    # independently keeps looping past the end when start is after end.
    while (year, month) <= last:
        days_in_month = calendar.monthrange(year, month)[1]
        ranges.append(('%d-%02d-01' % (year, month),
                       '%d-%02d-%02d' % (year, month, days_in_month)))
        month += 1
        if month == 13:
            year += 1
            month = 1
    return ranges


def year_ranges(start_date, end_date):
    """Return ``(Jan 1, Dec 31)`` date strings for each year in a span.

    Only the year of the ``YYYY-MM-DD`` arguments is used; both years are
    included.

    :returns: list of 2-tuples of strings; empty when start is after end.
    """
    first_year = int(start_date.split('-')[0])
    last_year = int(end_date.split('-')[0])
    return [('%d-01-01' % year, '%d-12-31' % year)
            for year in range(first_year, last_year + 1)]


def readability_scores(word_count, sentence_count, syllable_count):
    """Return ``(grade_level, reading_ease)`` for the given text totals.

    Both counts used as divisors (``word_count`` and ``sentence_count``) must
    be non-zero; the caller checks this before calling.
    """
    words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count
    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
    return grade_level, reading_ease


def fetch_corpus(call):
    """Page through ``cw.text(**call)`` and return all spoken text as one string.

    ``call`` is updated in place with the ``page`` being requested.  Paging
    stops at the first empty response.
    """
    pieces = []
    page = 0
    while True:
        print('page %d...' % page)
        call.update(page=page)
        resp = cw.text(**call)
        if not len(resp):
            break
        page += 1
        for chunk in resp:
            try:
                pieces.append(' %s' % ' '.join(chunk['speaking']))
            except TypeError:
                # 'speaking' was not a sequence of strings; use its str().
                pieces.append(' %s' % str(chunk['speaking']))
            except Exception:
                # Best effort: skip chunks that cannot be read, but (unlike a
                # bare ``except:``) let KeyboardInterrupt/SystemExit through.
                pass
    # Collected in a list and joined once to avoid quadratic string growth.
    return ''.join(pieces)


def grade_level_records(facet_values):
    """Yield one result dict per facet combination that returned any text.

    :param facet_values: ordered mapping of query key -> list of values.  The
        special key ``'dates'`` holds ``(start_date, end_date)`` pairs that
        are expanded into the two separate query keys.

    Combinations are visited with the last key varying fastest.  A
    combination whose corpus has no words or no sentences yields nothing.
    """
    import itertools

    names = list(facet_values.keys())
    value_lists = [facet_values[name] for name in names]
    for combination in itertools.product(*value_lists):
        call = {}
        for name, value in zip(names, combination):
            if name == 'dates':
                call['start_date'] = value[0]
                call['end_date'] = value[1]
            else:
                call[name] = value
        callsum = '-'.join(['-'.join([key, str(val)]) for (key, val) in call.items()])
        print(callsum)

        call.update(phrase='*', per_page=PER_PAGE)
        if not call.get('bioguide_id'):
            # No specific legislator requested: match any non-empty bioguide id.
            call.update(bioguide_id="['' TO *]")

        corpus = fetch_corpus(call)

        # Single-character tokens are punctuation or noise, except 'a' and 'I'.
        words = [word for word in tokenize(corpus)
                 if (len(word) > 1) or (word.lower() in ['a', 'i'])]
        sentences = sent_tokenize(corpus)

        syllables = []
        missing_syllables = 0
        for word in words:
            try:
                syllables.append(nsyl(word))
            except KeyError:
                missing_syllables += 1
        # pad syllable count out to word count
        syllables.extend([SYLLABLE_AVG] * missing_syllables)

        word_count = len(words)
        sentence_count = len(sentences)
        syllable_count = sum(syllables)
        if not (word_count > 0 and sentence_count > 0):
            continue

        grade_level, reading_ease = readability_scores(
            word_count, sentence_count, syllable_count)
        record = {
            'words': word_count,
            'syllables': syllable_count,
            'missed_count': missing_syllables,
            'missed_pct': missing_syllables / word_count,
            'sentences': sentence_count,
            'grade_level': grade_level,
            'reading_ease': reading_ease,
        }
        if call.get('bioguide_id') and not call['bioguide_id'].startswith('['):
            leg = LEGISLATORS[call['bioguide_id']]
            record.update(bioguide_id=call['bioguide_id'],
                          party=leg['party'],
                          chamber=leg['chamber'],
                          state=leg['state'],
                          name='%s, %s' % (leg['lastname'], leg['firstname']))
        if call.get('start_date') and call.get('end_date'):
            record.update(start_date=call['start_date'], end_date=call['end_date'])
        # Explicit facet values override the ones copied from the legislator.
        if call.get('chamber'):
            record.update(chamber=call['chamber'])
        if call.get('party'):
            record.update(party=call['party'])
        if call.get('congress'):
            record.update(congress=call['congress'])
        yield record


def dump_records(records, out):
    """Write *records* to the file-like *out* as a JSON array.

    A separator is written only BETWEEN records, so the output is valid JSON
    however many records there are (including none).  ``out`` is flushed
    after every record so partial progress reaches disk during long runs.
    """
    out.write('[')
    for index, record in enumerate(records):
        if index:
            out.write(",\n")
        out.write(json.dumps(record))
        out.flush()
    out.write(']')


if __name__ == '__main__':
    from optparse import OptionParser

    parser = OptionParser(usage='%prog [options] <filename>')
    parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
                      choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
                      help='Set the facet(s) to aggregate grade levels over')
    parser.add_option('--start-date', dest='start_date', default='1996-01-01',
                      help='How far back to limit the search, Required with month or year faceting.')
    parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
                      help='How far forward to limit the search, required with month or year faceting.')
    options, args = parser.parse_args()

    if not args:
        # Prints the usage line to stderr and exits with a non-zero status.
        parser.error('an output <filename> is required')
    filename = args[0]

    if not options.facets:
        options.facets = ['bioguide']

    # Insertion order matters: it decides which facet varies fastest.
    kwargs = SortedDict()
    if 'bioguide' in options.facets:
        kwargs['bioguide_id'] = list(LEGISLATORS.keys())
    if 'chamber' in options.facets:
        kwargs['chamber'] = ['house', 'senate']
    if 'party' in options.facets:
        kwargs['party'] = ['D', 'R', 'I']
    if 'state' in options.facets:
        kwargs['state'] = STATES

    # The time-based facets are mutually exclusive; the first match wins.
    if 'month' in options.facets:
        kwargs['dates'] = month_ranges(options.start_date, options.end_date)
    elif 'year' in options.facets:
        kwargs['dates'] = year_ranges(options.start_date, options.end_date)
    elif 'congress' in options.facets:
        kwargs['congress'] = list(range(STARTING_CONGRESS, CURRENT_CONGRESS + 1))

    # With month/year faceting each query carries its own date window;
    # otherwise the overall start/end dates bound every query.
    if 'dates' not in kwargs:
        if options.start_date:
            kwargs['start_date'] = [options.start_date]
        if options.end_date:
            kwargs['end_date'] = [options.end_date]

    try:
        out = open(filename, 'w')
    except (IOError, OSError) as e:
        parser.error('cannot open %s: %s' % (filename, e))

    try:
        dump_records(grade_level_records(kwargs), out)
    finally:
        out.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment