Last active
April 23, 2017 16:30
-
-
Save frenata/4c9c30520b0128f6ed180a5c62a8946b to your computer and use it in GitHub Desktop.
no metadata!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pupa.scrape import Person, Scraper | |
import lxml.html | |
import re | |
import pdb | |
class IAPersonScraper(Scraper):
    """Scrape Iowa legislators from the www.legis.iowa.gov member listings.

    The chamber listing page supplies each member's name, district and
    party; the member's own page supplies phone, address and e-mail.
    """

    jurisdiction = 'ia'

    # Row labels on a member's detail page -> internal field names.
    _OFFICE_FIELDS = {
        "Legislative E-mail:": "email",
        "Home Phone:": "home_phone",
        "Home Address:": "home_addr",
        "Capitol Phone:": "office_phone",
    }

    # (internal field name, contact type, note), in the order the contact
    # details are attached to the Person.
    _CONTACT_DETAILS = (
        ("home_phone", "voice", "District Office"),
        ("home_addr", "address", "District Office"),
        ("email", "email", "Capitol Office"),
        ("office_phone", "voice", "Capitol Office"),
    )

    def _resolve_session_id(self, session_id=None):
        """Return ``session_id``, falling back to ``self.metadata`` if None."""
        if session_id is None:
            # NOTE(review): the photo URL's ``ga`` parameter presumably wants
            # a General Assembly number (e.g. a legislative session's
            # 'number'); confirm that self.metadata actually provides it.
            session_id = self.metadata
        return session_id

    def scrape(self, chamber=None):
        """Yield a Person for every legislator in the requested chamber(s).

        Args:
            chamber: 'upper' or 'lower'; when falsy both chambers are
                scraped, upper first.
        """
        session_id = self._resolve_session_id()
        chambers = [chamber] if chamber else ['upper', 'lower']
        for chamber_name in chambers:
            yield from self.scrape_chamber(chamber_name, session_id)

    def scrape_chamber(self, chamber, session_id=None):
        """Yield a Person for each member linked from a chamber's listing.

        Args:
            chamber: 'lower' selects the house listing; any other value
                selects the senate listing.
            session_id: value interpolated into the photo URL; resolved
                from the scraper itself when omitted.
        """
        session_id = self._resolve_session_id(session_id)

        url = "https://www.legis.iowa.gov/legislators/"
        url += "house" if chamber == "lower" else "senate"

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        table = page.xpath('//table[@id="sortableTable"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator')]"):
            yield from self.scrape_member(chamber, link, session_id)

    def scrape_member(self, chamber, link, session_id=None):
        """Build and yield the Person described by one listing-table link.

        Args:
            chamber: value passed to Person as ``primary_org``.
            link: anchor element from the listing table; the 3rd and 4th
                cells of its row are read as district and party.
            session_id: value interpolated into the photo URL; resolved
                from the scraper itself when omitted.

        Raises:
            ValueError: if the link's href carries no ``personID`` value.
        """
        session_id = self._resolve_session_id(session_id)

        name = link.text.strip()
        leg_url = link.get('href')
        # Trim cell text so the party comparison below and the stored
        # values are not thrown off by surrounding whitespace.
        district = link.xpath("string(../../td[3])").strip()
        party = link.xpath("string(../../td[4])").strip()

        if party == 'Democrat':
            party = 'Democratic'

        match = re.search(r"personID=(\d+)", leg_url)
        if match is None:
            raise ValueError(
                "no personID found in legislator link: %r" % (leg_url,))
        pid = match.group(1)

        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (session_id, pid))

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url)

        leg.add_link(leg_url)
        leg.add_source(leg_url)

        leg_page = lxml.html.fromstring(self.get(leg_url).text)
        self.scrape_member_page(leg, leg_page)
        yield leg

    def scrape_member_page(self, leg, leg_page):
        """Attach contact details found on a member's page to ``leg``.

        Args:
            leg: the Person to receive the contact details.
            leg_page: parsed HTML of the member's page.

        Raises:
            ValueError: if the page does not contain exactly one contact
                table (raised by the single-element unpacking below).
        """
        table, = leg_page.xpath(
            "//div[@class='legisIndent divideVert']/table"
        )

        metainf = {}
        for row in table.xpath(".//tr"):
            cells = row.xpath("./td")
            # Only two-cell "label / value" rows carry contact data.
            if len(cells) != 2:
                continue
            key, value = (cell.text_content().strip() for cell in cells)
            field = self._OFFICE_FIELDS.get(key)
            if field is not None:
                metainf[field] = value

        for field, contact_type, note in self._CONTACT_DETAILS:
            if field in metainf:
                leg.add_contact_detail(type=contact_type,
                                       value=metainf[field],
                                       note=note)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pupa.scrape import Jurisdiction, Organization | |
from .people import IAPersonScraper | |
from .bills import IABillScraper | |
class Iowa(Jurisdiction):
    """Jurisdiction definition for the Iowa state government."""

    division_id = "ocd-division/country:us/state:ia"
    classification = "government"
    name = "Iowa"
    url = "https://www.legis.iowa.gov/"

    # Only the people scraper is registered; the bills scraper is imported
    # by this module but not enabled here.
    scrapers = {
        'people': IAPersonScraper,
    }

    parties = [
        {'name': 'Republican'},
        {'name': 'Democratic'},
    ]

    # One entry per General Assembly; "number" is the assembly number.
    legislative_sessions = [
        {
            "_scraped_name": "General Assembly: 84",
            "end_date": "2013-01-13",
            "identifier": "2011-2012",
            "number": "84",
            "name": "2011-2012 Regular Session",
            "start_date": "2011-01-10",
        },
        {
            "_scraped_name": "General Assembly: 85",
            "identifier": "2013-2014",
            "number": "85",
            "name": "2013-2014 Regular Session",
        },
        {
            "_scraped_name": "General Assembly: 86",
            "identifier": "2015-2016",
            "number": "86",
            "name": "2015-2016 Regular Session",
        },
        {
            "_scraped_name": "General Assembly: 87",
            "identifier": "2017-2018",
            "number": "87",
            "name": "2017-2018 Regular Session",
        },
    ]

    ignored_scraped_sessions = [
        "Legislative Assembly: 86",
        "General Assembly: 83",
        "General Assembly: 82",
        "General Assembly: 81",
        "General Assembly: 80",
        "General Assembly: 79",
        "General Assembly: 79",
        "General Assembly: 78",
        "General Assembly: 78",
        "General Assembly: 77",
        "General Assembly: 77",
        "General Assembly: 76",
    ]

    def get_organizations(self):
        """Yield the legislature, then the Senate, then the House.

        Each chamber gets one numbered post per seat (50 in the Senate,
        100 in the House) before anything is yielded.
        """
        general_assembly = Organization(name="Iowa General Assembly",
                                        classification="legislature")
        senate = Organization("Senate", classification='upper',
                              parent_id=general_assembly._id)
        house = Organization("House", classification='lower',
                             parent_id=general_assembly._id)

        # (organization, seat count, member title, division id template)
        chamber_specs = (
            (senate, 50, "Senator", '{}/sldu:{}'),
            (house, 100, "Representative", '{}/sldl:{}'),
        )
        for chamber_org, seat_count, title, template in chamber_specs:
            for seat in range(1, seat_count + 1):
                chamber_org.add_post(
                    label=str(seat), role=title,
                    division_id=template.format(self.division_id, seat))

        yield general_assembly
        yield senate
        yield house
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment