Last active
January 2, 2016 21:19
-
-
Save lukerosiak/8362479 to your computer and use it in GitHub Desktop.
Parse appropriations from Cato XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import re | |
import csv | |
from bs4 import BeautifulSoup | |
#format numbers | |
def commafy(x):
    """Format an integer with thousands separators, e.g. 1234567 -> '1,234,567'.

    Negative values keep their sign: -1234 -> '-1,234'.

    Raises:
        TypeError: if x is not exactly an integer type (floats, strings and
            bools are rejected).
    """
    # type(1 << 64) is `long` on Python 2 and `int` on Python 3, so this check
    # accepts the same types as before while staying valid syntax on both.
    if type(x) not in (type(0), type(1 << 64)):
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        # Format the magnitude with this same function and prepend the sign.
        return '-' + commafy(-x)
    result = ''
    while x >= 1000:
        # Peel off the lowest three digits and prepend them, zero-padded.
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)
# Use govtrack's bioguide spreadsheet to get the state and formal name of
# lawmakers. We need two lookup tables because a sponsor could be identified
# by 1 of 2 kinds of ids (the 'bioguide' column or the 'lis' column).
# The CSV comes from https://github.com/unitedstates/congress-legislators
# (a simple Python script converts current_legislators from YAML to CSV).
bioguide = {}
bioguide_lis = {}
# Read the file once and close it deterministically; both tables index the
# same row dicts, which are only ever read afterwards.
# NOTE(review): rows with an empty 'lis' value all land on the '' key, so the
# last such row wins -- confirm no sponsor id is ever the empty string.
with open('bioguide.csv', 'r') as fbioguide_file:
    fbioguide = csv.DictReader(fbioguide_file)
    for line in fbioguide:
        bioguide[line['bioguide']] = line
        bioguide_lis[line['lis']] = line
def removeNonAscii(s):
    """Return s with every character whose ordinal is 128 or higher dropped."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)
def process_sponsor(x):
    """Return (sponsor_name, sponsor_id, govtrack, state) for a parsed bill.

    x is a parsed document exposing find(); only its first 'sponsor' element
    is consulted.

    - No sponsor element: returns (None, None, None, None).
    - Sponsor id found in the `bioguide` or `bioguide_lis` table: the name is
      rebuilt as '<Term_type>. <official_full> (<P>-<state>)' with non-ASCII
      characters stripped, and govtrack/state come from the table row.
    - Sponsor id unknown (or the element has no name-id attribute): the name
      is the element's own text and govtrack/state are empty strings.
    """
    sponsor = x.find('sponsor')
    if sponsor is None:
        return (None, None, None, None)

    sponsor_name = sponsor.string
    # .get() so a sponsor element lacking name-id yields None rather than
    # raising KeyError and aborting the whole run.
    sponsor_id = sponsor.get('name-id')

    b = None
    if sponsor_id in bioguide:
        b = bioguide[sponsor_id]
    elif sponsor_id in bioguide_lis:
        b = bioguide_lis[sponsor_id]

    if b:
        govtrack = b['govtrack']
        state = b['state']
        # [:1] instead of [0] so a blank party column cannot raise IndexError.
        party_initial = b['party'][:1].upper()
        sponsor_name = (b['term_type'].title() + '. ' + b['official_full']
                        + ' (' + party_initial + '-' + state + ')')
        sponsor_name = removeNonAscii(sponsor_name)
    else:
        govtrack = ''
        state = ''
    return (sponsor_name, sponsor_id, govtrack, state)
# Prepare the output directory, wipe any previous download, then fetch and
# unpack the archive. The commands run through the shell in this order and
# their exit statuses are not inspected.
for shell_command in (
    'mkdir output',
    'rm dl -rf',
    'mkdir dl',
    'wget http://deepbills.cato.org/download -O dl/dl.zip',
    'unzip dl/dl.zip -d dl',
    'chmod 755 -R dl',  # weird permissions issue with Cato ZIP file
):
    os.system(shell_command)
billlist = []
reg = re.compile(r'(\d+)(\w+\d+)')
dir_files = os.listdir('dl/bills')
# Set copy of the listing so "does a sibling version exist?" is an O(1) check.
known_files = set(dir_files)


def _parse_bill(filename):
    """Read dl/bills/<filename>, drop non-ASCII characters and parse it as XML."""
    with open('dl/bills/' + filename, 'r') as handle:
        return BeautifulSoup(removeNonAscii(handle.read()), "xml")


def _first_existing(congress, billnum, suffixes):
    """Return the first '<congress><billnum><suffix>.xml' found in the listing, else None."""
    for suffix in suffixes:
        candidate = '%s%s%s.xml' % (congress, billnum, suffix)
        if candidate in known_files:
            return candidate
    return None


for f in dir_files:
    # split the filename into the congress and bill number to generate link to govtrack
    m = reg.match(f.replace('.xml', ''))
    if m is None:
        # Not named like a bill file; skip it instead of crashing on m.groups().
        continue
    link = m.groups()
    (congress, billnum) = link

    # if there's another file representing a later version of the bill
    # (eh/rh for an ih file, es/rs for an is file), skip this one
    if f.endswith('ih.xml') and _first_existing(congress, billnum, ('eh', 'rh')):
        continue
    if f.endswith('is.xml') and _first_existing(congress, billnum, ('es', 'rs')):
        continue

    x = _parse_bill(f)

    # Title: prefer <title>, then <short-title>, then <official-title>.
    title_tag = x.find('title')
    title = title_tag.string if title_tag is not None else None
    if not title:
        for fallback_name in ('short-title', 'official-title'):
            fallback_tag = x.find(fallback_name)
            if fallback_tag is not None:
                title = fallback_tag.string
                break

    a3 = []  # there can be many appropriations in a bill; this is a list of them
    amount = 0  # the running tally of total cost
    indefinite = False  # does "such sums as necessary" appear anywhere? if so, that changes our dollar amount
    for a in x.findAll('funds-and-year'):
        print(a)
        if a.string:
            a3.append(a.string)
        raw_amount = a.get('amount')
        if raw_amount:
            if raw_amount == 'indefinite':
                indefinite = True
            else:
                try:
                    amount += int(raw_amount)
                except (TypeError, ValueError):
                    print('%s amount couldnt be parsed as int' % (a,))

    date_tag = x.find('date')
    introdate = date_tag.string if date_tag is not None else None

    # Collect every date attached to an action; the latest one orders the output.
    lastaction = []
    for a in x.findAll('action-date'):
        action_date = a.get('date')
        if action_date is not None:
            lastaction.append(action_date)
    attestation = x.find('attestation-date')
    if attestation is not None:
        # special case for getting date from passed bills
        # ="20130730" chamber="House">Passed the House of Representatives July 30, 2013.</attestation-date>
        attested = attestation.get('date')
        if attested and len(attested) == 8:
            lastaction.append(attested)
    lastaction.sort()
    if lastaction:
        latest = lastaction[-1]
        # YYYYMMDD -> YYYY-MM-DD
        actiondate = latest[:4] + '-' + latest[4:6] + '-' + latest[6:]
    else:
        actiondate = None
    actionall = ';'.join(lastaction)

    # Status comes from the resolution-stage attribute when the document has a
    # <resolution> carrying one, otherwise from the <bill>'s bill-stage
    # attribute (hyphens turned into spaces). A missing element gives
    # TypeError (None[...]); a missing attribute gives KeyError.
    status = None
    try:
        status = x.resolution['resolution-stage']
    except (TypeError, KeyError):
        try:
            status = x.bill['bill-stage'].replace('-', ' ')
        except (TypeError, KeyError):
            pass

    purpose = x.find('property', {'name': 'purpose'})
    if purpose is not None:
        purpose = purpose.string

    (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x)
    if not sponsor_name and f.endswith(('es.xml', 'eh.xml')):
        # passed bills dont include sponsors, so open up the file showing its earlier version to grab it
        if f.endswith('eh.xml'):
            status = 'Passed the House'
            earlier_version = _first_existing(congress, billnum, ('ih', 'rh'))
        else:
            # es.xml: this branch used to test 'is.xml', which can never be
            # true here, so Senate-passed bills got neither status nor sponsor.
            status = 'Passed the Senate'
            earlier_version = _first_existing(congress, billnum, ('is', 'rs'))
        if earlier_version:
            (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(_parse_bill(earlier_version))

    # only list bills with money
    if amount > 0 or indefinite:
        descrip = '; '.join(a3)
        if not indefinite:
            display_amount = '$' + commafy(amount)
        elif amount == 0:
            display_amount = 'such sums as necessary'
        else:
            display_amount = '$' + commafy(amount) + ' + such sums as necessary'
        d = {
            'id': f,
            'title': title,
            'funds-and-year': descrip,
            'amount': amount,
            'indefinite': indefinite,
            'display_amount': display_amount,
            'sponsorid': sponsor_id,
            'sponsorname': sponsor_name,
            'status': status,
            'actiondate': actiondate,
            'introdate': introdate,
            'actionall': actionall,
            'purpose': purpose,
            'congress': congress,
            'billnum': billnum,
            'govtrack': govtrack,
            'state': state,
        }
        billlist.append(d)
# order them by how recently anything has happened with them
# Bills with no recorded action carry actiondate None. Keying those as ''
# keeps them at the end of the reversed sort (where None already lands on
# Python 2) without comparing None to str, which raises TypeError on Python 3.
billlist.sort(key=lambda k: k['actiondate'] or '', reverse=True)

# Both outputs are JSONP-style: a callback name wrapping the JSON array.
# short.json holds only the five most recently acted-on bills.
with open('output/short.json', 'w') as fshort:
    fshort.write('catoshort(' + json.dumps(billlist[:5]) + ');')

with open('output/long.json', 'w') as flong:
    flong.write('cato(' + json.dumps(billlist) + ');')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment