Last active
December 28, 2015 17:59
-
-
Save erochest/7539962 to your computer and use it in GitHub Desktop.
A script for scraping enrollment race data from the MD Report Card site.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Dependencies: | |
# - pip install lxml | |
# - pip install cssselect | |
# - pip install requests | |
import collections | |
import csv | |
import re | |
import sys | |
from urlparse import parse_qs, urljoin, urlparse | |
import requests | |
import lxml.etree | |
import lxml.html | |
ROOT = 'http://www.mdreportcard.org/rcounty.aspx?WDATA=School' | |
### Data types | |
RACE_FIELDS = [ | |
'am_ind', 'asian', 'afam', 'hispanic', 'hi', 'white', 'multi', | |
] | |
SCHOOL_FIELDS = ( | |
['county', 'county_code', 'name', 'code', 'year', 'total'] + RACE_FIELDS | |
) | |
YEARS = {'2011', '2012', '2013'} | |
InfoLink = collections.namedtuple('InfoLink', ['info', 'href']) | |
SchoolInfo = collections.namedtuple('SchoolInfo', SCHOOL_FIELDS) | |
def get_page(url): | |
"""Download the page using GET and return the document.""" | |
r = requests.get(url) | |
return lxml.html.document_fromstring(r.text) | |
def get_text_of(parent, css, f, sep=''): | |
"""\ | |
This gets the elements for the CSS selector, calls f to get the text, and | |
joins it all together. | |
""" | |
return sep.join( | |
f(node) for node in parent.cssselect(css) | |
).strip() | |
def get_schools(url=ROOT): | |
"""This walks the root page and returns NamedLinks for each school.""" | |
root = get_page(url) | |
for td in root.cssselect('.countySchoolTable td'): | |
county_name = get_text_of(td, '.countyBox', lambda n: n.text) | |
link = get_text_of(td, '.demographicsBox a', lambda n: n.get('href')) | |
county_url = urljoin(url, link) | |
county_page = get_page(county_url) | |
for a in county_page.cssselect('div.schoolBox a'): | |
info = SchoolInfo( | |
county_name, | |
None, | |
a.text.strip(), | |
re.search(r'\d+', a.tail).group(), | |
None, | |
None, | |
None, | |
None, | |
None, | |
None, | |
None, | |
None, | |
None, | |
) | |
school_url = urljoin(county_url, a.get('href')) | |
yield InfoLink(info, school_url) | |
def load_info(info_link, years): | |
"""\ | |
This takes an unpopulated InfoLink, gets the page and updates the data. | |
""" | |
index = get_page(info_link.href) | |
for a in index.cssselect('a'): | |
if a.text and a.text.strip() == 'Enrollment': | |
info = info_link.info | |
url = urljoin(info_link.href, a.get('href')) | |
parts = urlparse(url) | |
query = parse_qs(parts.query) | |
pv = query.get('PV') | |
if pv is None: | |
sys.stderr.write('Invalid URL: {}\n'.format(url)) | |
continue | |
county_code = pv[0].split(':')[2] | |
year_infos = get_enrollment( | |
url, info._replace(county_code=county_code), years, | |
) | |
for year_info in year_infos: | |
yield year_info | |
def get_enrollment(url, info, years): | |
"""\ | |
This loads all of the enrollment data. The URL is the main enrollment URL | |
linked to from the School's site. | |
""" | |
page = get_page(url) | |
totals = {} | |
for table in page.cssselect('table.datatable'): | |
totals.update(read_enrollment(table)) | |
race_totals = {} | |
for ul in page.cssselect('#raceList'): | |
for a in ul.cssselect('li a'): | |
if a.text and a.text.strip().startswith('All Races'): | |
race_url = urljoin(url, a.get('href')) | |
race_totals = load_race_totals(get_page(race_url)) | |
break | |
for year in years: | |
if year in totals: | |
total = totals[year] | |
rtotals = race_totals[year] | |
yield info._replace( | |
year = year, | |
total = total, | |
am_ind = rtotals.get('am_ind'), | |
asian = rtotals.get('asian'), | |
afam = rtotals.get('afam'), | |
hispanic = rtotals.get('hispanic'), | |
hi = rtotals.get('hi'), | |
white = rtotals.get('white'), | |
multi = rtotals.get('multi'), | |
) | |
def read_enrollment(table): | |
"""\ | |
This pulls the data for all years from the enrollment table and returns it | |
as a dict. | |
""" | |
totals = {} | |
for tbody in table.cssselect('tbody'): | |
for node in tbody.cssselect('tr.year th.label'): | |
year = node.text | |
for node in tbody.cssselect('tr td'): | |
if node.text and node.text.strip().isdigit(): | |
totals[year] = int(node.text) | |
break | |
return totals | |
def load_race_totals(page): | |
totals = {} | |
for table in page.cssselect('table.datatable'): | |
year = None | |
for tr in table.cssselect('tr'): | |
if tr.get('class') == 'year': | |
year = tr.cssselect('th')[0].text | |
continue | |
if year is None: | |
continue | |
tds = [ | |
td | |
for td in tr.cssselect('td') | |
if td.get('class') != 'label' | |
] | |
totals[year] = { | |
k: v.text | |
for (k, v) in zip(RACE_FIELDS, tds) | |
} | |
return totals | |
def main(): | |
writer = csv.writer(sys.stdout, dialect='excel-tab') | |
writer.writerow(SCHOOL_FIELDS) | |
for info in get_schools(): | |
writer.writerows( year_data for year_data in load_info(info, YEARS) ) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment