Created
August 25, 2013 18:08
-
-
Save jwass/6335354 to your computer and use it in GitHub Desktop.
Scrape MA state senate/representative contact info and merge it with the legislator's property info.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import difflib | |
import json | |
import re | |
import bs4 | |
import pandas as pd | |
import requests | |
# Captures whatever sits between an opening '(' and the last ')' that follows
# it on the same line (the '.*' is greedy). Written as a raw string so the
# backslashes reach the regex engine instead of being read as string escapes.
regex = re.compile(r'\((.*)\)')
def parse_contact_info(chamber):
    """Scrape one chamber's member listing into a contact lookup.

    Args:
        chamber: 'senate' selects the Senate listing; any other value
            selects the House listing.

    Returns:
        A dict keyed by the lower-cased text found inside parentheses in
        each row's 'Name' cell. Each value is a dict holding that row's
        'Phone Number' and 'Email Address'.

    Raises:
        requests.RequestException: the request timed out, failed, or the
            server answered with an error status.
        ValueError: the page contains no <table> element.
    """
    if chamber == 'senate':
        url = 'https://malegislature.gov/People/Senate'
    else:
        url = 'https://malegislature.gov/People/House'

    # Bound the wait and fail loudly on 4xx/5xx instead of parsing an
    # error page as if it were the member listing.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # I really should be able to do pd.read_html(r.content)
    # but for some reason it doesn't find the table so use bs4 to
    # tease it out
    soup = bs4.BeautifulSoup(r.content)
    table = soup.find('table')
    if table is None:
        raise ValueError('No <table> element found at {}'.format(url))

    # NOTE(review): `infer_types` is kept from the original call; it may be
    # deprecated or removed in newer pandas releases - confirm against the
    # installed version.
    df = pd.read_html(table.prettify(), infer_types=False)[0]

    # Index rows by the parenthesised part of the Name cell, lower-cased.
    # Rows with no parenthesised part presumably end up with a missing
    # (NaN) key - verify against the live page.
    keys = df['Name'].str.findall(regex).str[0].str.lower()
    df = df.set_index(keys)

    df = df[['Phone Number', 'Email Address']]
    return df.T.to_dict()
def run_chamber(chamber):
    """Merge scraped phone/email data into one chamber's property file.

    Loads the chamber's JSON property file, fuzzy-matches each legislator
    name against the scraped contact listing, adds 'PHONE' and 'EMAIL' to
    every matched record, and rewrites the file in place. Names with no
    sufficiently close match are reported on stdout and left untouched.

    Args:
        chamber: 'senate' processes properties/senate.json using the
            'SENATOR' field; any other value processes
            properties/house.json using the 'REP' field.
    """
    contact = parse_contact_info(chamber)
    if chamber == 'senate':
        name_field = 'SENATOR'
        filename = 'properties/senate.json'
    else:
        name_field = 'REP'
        filename = 'properties/house.json'

    with open(filename) as f:
        props = json.load(f)

    # difflib needs sequences to compare against. Drop float keys (a missing
    # name would presumably show up as NaN) so one bad row cannot abort the
    # whole run with a TypeError.
    candidates = [key for key in contact if not isinstance(key, float)]

    # Only the records are needed, not their keys; .values() also works on
    # both Python 2 and Python 3, unlike .iteritems().
    for p in props.values():
        name = p[name_field]
        if name == 'OPEN':
            # Vacant seat: nobody to look up.
            continue
        # Strip the party suffix before comparing.
        name = name.replace(' (D)', '').replace(' (R)', '').lower()

        # Use difflib to find the 'close' name matches. This helps greatly in
        # places where there are middle initials, apostrophes, unicode
        # characters and other stuff that would screw up an exact match
        m = difflib.get_close_matches(name, candidates, n=1, cutoff=0.63)
        if not m:
            print('No match for {}'.format(name))
            continue

        record = contact[m[0]]
        p['PHONE'] = record['Phone Number']
        p['EMAIL'] = record['Email Address']

    # Serialise before opening the file for writing, so a serialisation
    # error cannot leave behind a truncated property file.
    payload = json.dumps(props, indent=4, separators=(',', ': '))
    with open(filename, 'w') as f:
        f.write(payload)
def main():
    """Run the contact merge for the Senate, then for the House."""
    for chamber in ('senate', 'house'):
        run_chamber(chamber)


# Only run the merge when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment