Created
May 26, 2011 21:15
-
-
Save jpwatts/994100 to your computer and use it in GitHub Desktop.
Scraper for Houston's active incident report
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Scraper for Houston's active incident report""" | |
import collections | |
import csv | |
import operator | |
import sys | |
import dateutil.parser | |
import httplib2 | |
import pyquery | |
# Public names exported by a star-import of this module.
__all__ = ['CACHE', 'URL', 'ActiveIncident', 'scrape', 'dump']

# Handed to ``httplib2.Http`` as its cache argument; ``None`` means the
# fetched page is not cached.
CACHE = None

# Page that is fetched and parsed for the incident table.
URL = u"http://cbtcws.cityofhouston.gov/ActiveIncidents/Combined.aspx"

# One record per table row; field order follows the column order of the page.
ActiveIncident = collections.namedtuple(
    'ActiveIncident',
    [
        'agency',
        'address',
        'cross_street',
        'key_map',
        'call_time',
        'incident_type',
        'combined_response',
    ],
)
def _scrape():
    """Yield an ``ActiveIncident`` for every usable row of the report.

    The page at ``URL`` is fetched with ``httplib2`` (``CACHE`` is passed as
    the cache argument), the rows of the ``#dgResults`` table are selected
    and the first row is dropped. The text of every cell is stripped and
    upper-cased before use.

    A row is skipped, rather than aborting the whole scrape, when it:

    * does not have exactly one cell per ``ActiveIncident`` field,
    * has an empty address,
    * has a call time that ``dateutil`` cannot parse, or
    * has an empty incident type.

    Yields:
        ActiveIncident: ``call_time`` is the value returned by
        ``dateutil.parser.parse``, ``combined_response`` is a bool (true
        when the cell reads ``Y``), and the other fields are strings.
    """
    _response, html = httplib2.Http(CACHE).request(URL)
    expected_cells = len(ActiveIncident._fields)

    # The first row is dropped -- presumably the column headings; confirm
    # against the live page if the table layout changes.
    for row in pyquery.PyQuery(html)('#dgResults tr')[1:]:
        cols = [
            td.text_content().strip().upper()
            for td in pyquery.PyQuery('td', row)
        ]

        # Rows with an unexpected number of cells cannot be mapped onto the
        # record; indexing or unpacking them would raise.
        if len(cols) != expected_cells:
            continue

        (agency, address, cross_street, key_map,
         raw_call_time, incident_type, combined_flag) = cols

        if not address:
            continue  # Skip records without an address.

        try:
            call_time = dateutil.parser.parse(raw_call_time)
        except (ValueError, OverflowError):
            continue  # Skip records with an invalid call time.

        if not incident_type:
            continue  # Skip records without an incident type.

        yield ActiveIncident(
            agency,
            address,
            cross_street,
            key_map,
            call_time,
            incident_type,
            combined_flag == 'Y',
        )
def scrape():
    """Return the scraped incidents as a list, latest ``call_time`` first."""
    incidents = list(_scrape())
    incidents.sort(key=operator.attrgetter('call_time'), reverse=True)
    return incidents
def dump(f):
    """Write the active incidents to the file object *f* as CSV.

    A heading row is written first, then one row for each incident
    returned by ``scrape()``.
    """
    headings = [
        u"Agency (FD/PD)",
        u"Address",
        u"Cross Street",
        u"Key Map",
        u"Call Time (Opened)",
        u"Incident Type",
        u"Combined Response (Y/N)",
    ]
    out = csv.writer(f)
    out.writerow(headings)
    out.writerows(scrape())
# When executed as a script, write the CSV report to standard output.
if __name__ == "__main__":
    dump(sys.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment