lukerosiak · January 2, 2016 21:19
diff --git a/CatoApprops.py b/CatoApprops.py
 import os
 import json
 import re
 import csv

 from bs4 import BeautifulSoup


 #format numbers
 def commafy(x):
     """Format an integer with commas as thousands separators.

     Args:
         x: The integer to format. ``bool`` values are rejected, as before.

     Returns:
         The decimal string with a comma between each group of three digits,
         e.g. ``1234567`` -> ``'1,234,567'``. Negative values keep a leading
         ``'-'``.

     Raises:
         TypeError: If ``x`` is not an integer.
     """
     # numbers.Integral matches int on Python 3 and int/long on Python 2, so
     # the Python-2-only ``0L`` literal is no longer needed.
     import numbers

     if isinstance(x, bool) or not isinstance(x, numbers.Integral):
         raise TypeError("Parameter must be an integer.")
     if x < 0:
         # Recurse on the magnitude; this branch used to call an undefined
         # intWithCommas() and raised NameError for every negative input.
         return '-' + commafy(-x)
     result = ''
     while x >= 1000:
         # Peel off the lowest three digits and prepend them, zero-padded.
         x, r = divmod(x, 1000)
         result = ",%03d%s" % (r, result)
     return "%d%s" % (x, result)
    

 #use govtrack's bioguide spreadsheet to get state and formal name of lawmakers. we need two lookup tables because they could use 1 of 2 kinds of ids
 #the csv comes from https://github.com/unitedstates/congress-legislators
 #i have a simple python script that converts current_legislators from yaml to csv
 bioguide = {}      #rows keyed by the 'bioguide' column
 bioguide_lis = {}  #the same rows keyed by the 'lis' column
 #read the file once (it used to be opened and parsed twice) and close it when done
 with open('bioguide.csv','r') as fbioguide:
     for line in csv.DictReader(fbioguide):
         bioguide[line['bioguide']] = line
         bioguide_lis[line['lis']] = line


 def removeNonAscii(s):
     """Return ``s`` with every character whose code point is 128 or above dropped."""
     kept = (ch for ch in s if ord(ch) < 128)
     return "".join(kept)

 def process_sponsor(x):
     """Look up the sponsor of a parsed bill.

     Args:
         x: Parsed bill document. Only ``x.find('sponsor')`` is used; the
             caller in this file passes a BeautifulSoup object.

     Returns:
         A ``(sponsor_name, sponsor_id, govtrack, state)`` tuple. All four are
         ``None`` when the document has no sponsor element. When the sponsor id
         is in neither lookup table, the name is the element's own text and
         ``govtrack``/``state`` are empty strings.
     """
     #look the element up once instead of three times
     sponsor = x.find('sponsor')
     if not sponsor:
         return (None,None,None,None)

     sponsor_name = sponsor.string
     #.get() so a sponsor element without a name-id attribute doesn't raise KeyError
     sponsor_id = sponsor.get('name-id')

     #the id can be either of two kinds, so try both tables
     b = None
     if sponsor_id in bioguide:
         b = bioguide[sponsor_id]
     elif sponsor_id in bioguide_lis:
         b = bioguide_lis[sponsor_id]
     if not b:
         return (sponsor_name,sponsor_id,'','')

     govtrack = b['govtrack']
     state = b['state']
     #[:1] rather than [0] so a blank party column doesn't raise IndexError
     party = b['party'][:1].upper()
     sponsor_name = b['term_type'].title() + '. ' + b['official_full'] + ' (' + party + '-' + state + ')'
     return (removeNonAscii(sponsor_name),sponsor_id,govtrack,state)

 #make the output directory, then replace dl/ with a freshly downloaded and unzipped archive
 for command in (
     'mkdir output',
     'rm dl -rf',
     'mkdir dl',
     'wget http://deepbills.cato.org/download -O dl/dl.zip',
     'unzip dl/dl.zip -d dl',
     'chmod 755 -R dl', #weird permissions issue with Cato ZIP file
 ):
     os.system(command)

 billlist = []

 #leading digits are the congress, the following letters+digits are the bill number
 #raw string so \d and \w reach the regex engine untouched
 reg = re.compile(r'(\d+)(\w+\d+)')


 dir_files = os.listdir('dl/bills')
 dir_file_set = set(dir_files) #constant-time lookups for the sibling-version checks below

 for f in dir_files:

     #split the filename into the congress and bill number to generate link to govtrack
     m = reg.match(f.replace('.xml',''))
     if m is None:
         #filename doesn't follow the bill naming pattern; skip it rather than crash on m.groups()
         continue
     (congress,billnum) = m.groups()
     stem = congress + billnum

     #if there's another file representing the passed version of the bill, skip this one
     #(this check used to appear twice in a row; once is enough)
     if f.endswith('ih.xml'):
         if stem+'eh.xml' in dir_file_set or stem+'rh.xml' in dir_file_set:
             continue
     elif f.endswith('is.xml'):
         if stem+'es.xml' in dir_file_set or stem+'rs.xml' in dir_file_set:
             continue


     with open('dl/bills/'+f,'r') as bill_file:
         s = removeNonAscii(bill_file.read())
     x = BeautifulSoup(s,"xml")


     #prefer <title>; fall back to <short-title>, then <official-title>
     title_tag = x.find('title')
     title = title_tag.string if title_tag is not None else None
     if not title:
         for fallback in ('short-title','official-title'):
             fallback_tag = x.find(fallback)
             if fallback_tag:
                 title = fallback_tag.string
                 break


     a3 = [] #there can be many appropriations in a bill; this is a list of them
     amount = 0 #the running tally of total cost
     indefinite = False #does "such sums as necessary" appear anywhere? if so, that changes our dollar amount

     for a in x.findAll('funds-and-year'):
         print(a)
         if a.string: a3.append(a.string)
         raw_amount = a.get('amount')
         if raw_amount:
             if raw_amount=='indefinite':
                 indefinite = True
             else:
                 try:
                     amount += int(raw_amount)
                 except ValueError:
                     print('%s amount couldnt be parsed as int' % a)

     date_tag = x.find('date')
     introdate = date_tag.string if date_tag is not None else None

     #collect every action-date that actually carries a date attribute
     lastaction = []
     for a in x.findAll('action-date'):
         action = a.get('date')
         if action is not None:
             lastaction.append(action)

     attestation = x.find('attestation-date')
     if attestation and attestation.get('date') and len(attestation.get('date'))==8:
         #special case for getting date from passed bills
         #="20130730" chamber="House">Passed the House of Representatives July 30, 2013.</attestation-date>
         lastaction.append( attestation.get('date') )


     lastaction.sort()
     if lastaction:
         #take the latest date and insert dashes, e.g. 20130730 -> 2013-07-30
         actiondate = lastaction[-1]
         actiondate = actiondate[:4]+'-'+actiondate[4:6]+'-'+actiondate[6:]
     else:
         actiondate = None
     actionall = ';'.join(lastaction)


     #use the resolution's resolution-stage if present, otherwise the bill's bill-stage with hyphens turned into spaces
     status = None
     if x.resolution is not None and x.resolution.get('resolution-stage') is not None:
         status = x.resolution['resolution-stage']
     elif x.bill is not None and x.bill.get('bill-stage') is not None:
         status = x.bill['bill-stage'].replace('-',' ')

     purpose = x.find('property',{'name':'purpose'})
     if purpose: purpose = purpose.string

     (sponsor_name,sponsor_id,govtrack,state) = process_sponsor(x)

     if not sponsor_name and f.endswith(('es.xml','eh.xml')):
         #passed bills dont include sponsors, so open up the file showing its earlier version to grab it
         if f.endswith('eh.xml'):
             status = 'Passed the House'
             candidates = (stem+'ih.xml', stem+'rh.xml')
         else:
             #this branch used to test for 'is.xml', which can never match here, so passed Senate bills were never handled
             status = 'Passed the Senate'
             candidates = (stem+'is.xml', stem+'rs.xml')
         earlier_version = None
         for candidate in candidates:
             if candidate in dir_file_set:
                 earlier_version = candidate
                 break
         if earlier_version:
             with open('dl/bills/'+earlier_version,'r') as earlier_file:
                 s_earlier = removeNonAscii(earlier_file.read())
             x_earlier = BeautifulSoup(s_earlier,"xml")
             (sponsor_name,sponsor_id,govtrack,state) = process_sponsor(x_earlier)


     #only list bills with money
     if amount>0 or indefinite:
         descrip = '; '.join(a3)

         if indefinite:
             if amount==0:
                 display_amount = 'such sums as necessary'
             else:
                 display_amount = '$'+commafy(amount) + ' + such sums as necessary'
         else:
             display_amount = '$'+commafy(amount)


         d = {
             'id': f,
             'title': title,
             'funds-and-year': descrip,
             'amount': amount,
             'indefinite': indefinite,
             'display_amount': display_amount,
             'sponsorid': sponsor_id,
             'sponsorname': sponsor_name,
             'status': status,
             'actiondate': actiondate,
             'introdate': introdate,
             'actionall': actionall,
             'purpose': purpose,
             'congress': congress,
             'billnum': billnum,
             'govtrack': govtrack,
             'state': state,
         }
         billlist.append(d)


 #order them by how recently anything has happened with them
 #bills with no action have actiondate None; sort those as '' so they land last and None is never compared with a string
 billlist.sort(key=lambda k: k['actiondate'] or '',reverse=True)



 #the five most recent bills, wrapped in a catoshort(...) call
 with open('output/short.json','w') as fshort:
     fshort.write('catoshort(' + json.dumps(billlist[:5]) + ');')


 #the full list, wrapped in a cato(...) call
 with open('output/long.json','w') as flong:
     flong.write('cato(' + json.dumps(billlist) + ');')
	import os
	import json
	import re
	import csv

	from bs4 import BeautifulSoup


	#format numbers
	def commafy(x):
	    """Format an integer with commas as thousands separators.

	    Args:
	        x: The integer to format. ``bool`` values are rejected, as before.

	    Returns:
	        The decimal string with a comma between each group of three digits,
	        e.g. ``1234567`` -> ``'1,234,567'``. Negative values keep a leading
	        ``'-'``.

	    Raises:
	        TypeError: If ``x`` is not an integer.
	    """
	    # numbers.Integral matches int on Python 3 and int/long on Python 2, so
	    # the Python-2-only ``0L`` literal is no longer needed.
	    import numbers

	    if isinstance(x, bool) or not isinstance(x, numbers.Integral):
	        raise TypeError("Parameter must be an integer.")
	    if x < 0:
	        # Recurse on the magnitude; this branch used to call an undefined
	        # intWithCommas() and raised NameError for every negative input.
	        return '-' + commafy(-x)
	    result = ''
	    while x >= 1000:
	        # Peel off the lowest three digits and prepend them, zero-padded.
	        x, r = divmod(x, 1000)
	        result = ",%03d%s" % (r, result)
	    return "%d%s" % (x, result)


	#use govtrack's bioguide spreadsheet to get state and formal name of lawmakers. we need two lookup tables because they could use 1 of 2 kinds of ids
	#the csv comes from https://github.com/unitedstates/congress-legislators
	#i have a simple python script that converts current_legislators from yaml to csv
	bioguide = {}      #rows keyed by the 'bioguide' column
	bioguide_lis = {}  #the same rows keyed by the 'lis' column
	#read the file once (it used to be opened and parsed twice) and close it when done
	with open('bioguide.csv','r') as fbioguide:
	    for line in csv.DictReader(fbioguide):
	        bioguide[line['bioguide']] = line
	        bioguide_lis[line['lis']] = line


	def removeNonAscii(s):
	    """Return ``s`` with every character whose code point is 128 or above dropped."""
	    kept = (ch for ch in s if ord(ch) < 128)
	    return "".join(kept)

	def process_sponsor(x):
	    """Look up the sponsor of a parsed bill.

	    Args:
	        x: Parsed bill document. Only ``x.find('sponsor')`` is used; the
	            caller in this file passes a BeautifulSoup object.

	    Returns:
	        A ``(sponsor_name, sponsor_id, govtrack, state)`` tuple. All four are
	        ``None`` when the document has no sponsor element. When the sponsor id
	        is in neither lookup table, the name is the element's own text and
	        ``govtrack``/``state`` are empty strings.
	    """
	    #look the element up once instead of three times
	    sponsor = x.find('sponsor')
	    if not sponsor:
	        return (None,None,None,None)

	    sponsor_name = sponsor.string
	    #.get() so a sponsor element without a name-id attribute doesn't raise KeyError
	    sponsor_id = sponsor.get('name-id')

	    #the id can be either of two kinds, so try both tables
	    b = None
	    if sponsor_id in bioguide:
	        b = bioguide[sponsor_id]
	    elif sponsor_id in bioguide_lis:
	        b = bioguide_lis[sponsor_id]
	    if not b:
	        return (sponsor_name,sponsor_id,'','')

	    govtrack = b['govtrack']
	    state = b['state']
	    #[:1] rather than [0] so a blank party column doesn't raise IndexError
	    party = b['party'][:1].upper()
	    sponsor_name = b['term_type'].title() + '. ' + b['official_full'] + ' (' + party + '-' + state + ')'
	    return (removeNonAscii(sponsor_name),sponsor_id,govtrack,state)

	#make the output directory, then replace dl/ with a freshly downloaded and unzipped archive
	for command in (
	    'mkdir output',
	    'rm dl -rf',
	    'mkdir dl',
	    'wget http://deepbills.cato.org/download -O dl/dl.zip',
	    'unzip dl/dl.zip -d dl',
	    'chmod 755 -R dl', #weird permissions issue with Cato ZIP file
	):
	    os.system(command)

	billlist = []

	#leading digits are the congress, the following letters+digits are the bill number
	#raw string so \d and \w reach the regex engine untouched
	reg = re.compile(r'(\d+)(\w+\d+)')


	dir_files = os.listdir('dl/bills')
	dir_file_set = set(dir_files) #constant-time lookups for the sibling-version checks below

	for f in dir_files:

	    #split the filename into the congress and bill number to generate link to govtrack
	    m = reg.match(f.replace('.xml',''))
	    if m is None:
	        #filename doesn't follow the bill naming pattern; skip it rather than crash on m.groups()
	        continue
	    (congress,billnum) = m.groups()
	    stem = congress + billnum

	    #if there's another file representing the passed version of the bill, skip this one
	    #(this check used to appear twice in a row; once is enough)
	    if f.endswith('ih.xml'):
	        if stem+'eh.xml' in dir_file_set or stem+'rh.xml' in dir_file_set:
	            continue
	    elif f.endswith('is.xml'):
	        if stem+'es.xml' in dir_file_set or stem+'rs.xml' in dir_file_set:
	            continue


	    with open('dl/bills/'+f,'r') as bill_file:
	        s = removeNonAscii(bill_file.read())
	    x = BeautifulSoup(s,"xml")


	    #prefer <title>; fall back to <short-title>, then <official-title>
	    title_tag = x.find('title')
	    title = title_tag.string if title_tag is not None else None
	    if not title:
	        for fallback in ('short-title','official-title'):
	            fallback_tag = x.find(fallback)
	            if fallback_tag:
	                title = fallback_tag.string
	                break


	    a3 = [] #there can be many appropriations in a bill; this is a list of them
	    amount = 0 #the running tally of total cost
	    indefinite = False #does "such sums as necessary" appear anywhere? if so, that changes our dollar amount

	    for a in x.findAll('funds-and-year'):
	        print(a)
	        if a.string: a3.append(a.string)
	        raw_amount = a.get('amount')
	        if raw_amount:
	            if raw_amount=='indefinite':
	                indefinite = True
	            else:
	                try:
	                    amount += int(raw_amount)
	                except ValueError:
	                    print('%s amount couldnt be parsed as int' % a)

	    date_tag = x.find('date')
	    introdate = date_tag.string if date_tag is not None else None

	    #collect every action-date that actually carries a date attribute
	    lastaction = []
	    for a in x.findAll('action-date'):
	        action = a.get('date')
	        if action is not None:
	            lastaction.append(action)

	    attestation = x.find('attestation-date')
	    if attestation and attestation.get('date') and len(attestation.get('date'))==8:
	        #special case for getting date from passed bills
	        #="20130730" chamber="House">Passed the House of Representatives July 30, 2013.</attestation-date>
	        lastaction.append( attestation.get('date') )


	    lastaction.sort()
	    if lastaction:
	        #take the latest date and insert dashes, e.g. 20130730 -> 2013-07-30
	        actiondate = lastaction[-1]
	        actiondate = actiondate[:4]+'-'+actiondate[4:6]+'-'+actiondate[6:]
	    else:
	        actiondate = None
	    actionall = ';'.join(lastaction)


	    #use the resolution's resolution-stage if present, otherwise the bill's bill-stage with hyphens turned into spaces
	    status = None
	    if x.resolution is not None and x.resolution.get('resolution-stage') is not None:
	        status = x.resolution['resolution-stage']
	    elif x.bill is not None and x.bill.get('bill-stage') is not None:
	        status = x.bill['bill-stage'].replace('-',' ')

	    purpose = x.find('property',{'name':'purpose'})
	    if purpose: purpose = purpose.string

	    (sponsor_name,sponsor_id,govtrack,state) = process_sponsor(x)

	    if not sponsor_name and f.endswith(('es.xml','eh.xml')):
	        #passed bills dont include sponsors, so open up the file showing its earlier version to grab it
	        if f.endswith('eh.xml'):
	            status = 'Passed the House'
	            candidates = (stem+'ih.xml', stem+'rh.xml')
	        else:
	            #this branch used to test for 'is.xml', which can never match here, so passed Senate bills were never handled
	            status = 'Passed the Senate'
	            candidates = (stem+'is.xml', stem+'rs.xml')
	        earlier_version = None
	        for candidate in candidates:
	            if candidate in dir_file_set:
	                earlier_version = candidate
	                break
	        if earlier_version:
	            with open('dl/bills/'+earlier_version,'r') as earlier_file:
	                s_earlier = removeNonAscii(earlier_file.read())
	            x_earlier = BeautifulSoup(s_earlier,"xml")
	            (sponsor_name,sponsor_id,govtrack,state) = process_sponsor(x_earlier)


	    #only list bills with money
	    if amount>0 or indefinite:
	        descrip = '; '.join(a3)

	        if indefinite:
	            if amount==0:
	                display_amount = 'such sums as necessary'
	            else:
	                display_amount = '$'+commafy(amount) + ' + such sums as necessary'
	        else:
	            display_amount = '$'+commafy(amount)


	        d = {
	            'id': f,
	            'title': title,
	            'funds-and-year': descrip,
	            'amount': amount,
	            'indefinite': indefinite,
	            'display_amount': display_amount,
	            'sponsorid': sponsor_id,
	            'sponsorname': sponsor_name,
	            'status': status,
	            'actiondate': actiondate,
	            'introdate': introdate,
	            'actionall': actionall,
	            'purpose': purpose,
	            'congress': congress,
	            'billnum': billnum,
	            'govtrack': govtrack,
	            'state': state,
	        }
	        billlist.append(d)


	#order them by how recently anything has happened with them
	#bills with no action have actiondate None; sort those as '' so they land last and None is never compared with a string
	billlist.sort(key=lambda k: k['actiondate'] or '',reverse=True)



	#the five most recent bills, wrapped in a catoshort(...) call
	with open('output/short.json','w') as fshort:
	    fshort.write('catoshort(' + json.dumps(billlist[:5]) + ');')


	#the full list, wrapped in a cato(...) call
	with open('output/long.json','w') as flong:
	    flong.write('cato(' + json.dumps(billlist) + ');')
No results found