-
-
Save paulsmith/233068 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib, re | |
from pprint import pprint as pp | |
from datetime import datetime, timedelta | |
from geopy.geocoders import Google | |
import MySQLdb | |
from time import sleep | |
bcpd_arc_url = "http://maps.baltimorepolice.org/servlet/com.esri.esrimap.Esrimap?ServiceName=BaltimorePolice&CustomService=Query&ClientVersion=3.1&Form=True&Encode=False" | |
districts={"Northwest": | |
"""<POINT x="1393926.78398081" y="621171.476883567" /> | |
<POINT x="1397217.83" y="621171.476883567" /> | |
<POINT x="1397217.83" y="616180.92" /> | |
<POINT x="1393926.78398081" y="616180.92" />""", | |
"Northeast": | |
"""<POINT x="1442316.71979371" y="621406.8125" /> | |
<POINT x="1445310.91155006" y="621406.8125" /> | |
<POINT x="1445310.91155006" y="618412.039570119" /> | |
<POINT x="1442316.71979371" y="618412.039570119" />""", | |
"West": | |
"""<POINT x="1394077.54114138" y="592162.660878078" /> | |
<POINT x="1397806.49013248" y="592162.660878078" /> | |
<POINT x="1397806.49013248" y="586906.339042239" /> | |
<POINT x="1394077.54114138" y="586906.339042239" />""", | |
"East": | |
"""<POINT x="1442360.22501292" y="590833.205731698" /> | |
<POINT x="1445457.16335056" y="590833.205731698" /> | |
<POINT x="1445457.16335056" y="584158.004589979" /> | |
<POINT x="1442360.22501292" y="584158.004589979" />""", | |
"South": | |
"""<POINT x="1428897.34236402" y="565462.027408424" /> | |
<POINT x="1444872.773" y="565462.027408424" /> | |
<POINT x="1444872.773" y="557733.625" /> | |
<POINT x="1428897.34236402" y="557733.625" />""", | |
"Whole City": | |
"""<POINT x="1393926.78398081" y="621406.8125" /> | |
<POINT x="1442360.22501292" y="621406.8125" /> | |
<POINT x="1442360.22501292" y="557733.625" /> | |
<POINT x="1393926.78398081" y="557733.625" />""", | |
} | |
#XXX: note the extra "> right before SPATIALFILTER... sigh. | |
def requestXML(dt): | |
return """<?xml version="1.0" encoding="UTF-8" ?><ARCXML version="1.1"> | |
<REQUEST> | |
<GET_FEATURES outputmode="xml" envelope="true" geometry="false" featurelimit="1000" beginrecord="1"> | |
<LAYER id="6" /><SPATIALQUERY subfields="CCNO CRIME_DESC FROM_DATE PREM_DESC LOCATION POST DISTRICT" where="(FROM_DATE >= {ts '%s'}) AND (FROM_DATE <= {ts '%s'})"> "><SPATIALFILTER relation="area_intersection" ><POLYGON> | |
<RING> | |
%s | |
</RING> | |
</POLYGON> | |
</SPATIALFILTER></SPATIALQUERY></GET_FEATURES></REQUEST></ARCXML>""" % \ | |
(dt.isoformat(' '), (dt + timedelta(1)).isoformat(' '), districts["Whole City"]) | |
#return a list of tuples of (ccno, crime type, premises, time, address, post (?), district) | |
def scrape(dt): | |
postdata = urllib.urlencode({'ArcXMLRequest': requestXML(dt)}) | |
d = urllib.urlopen(bcpd_arc_url, postdata).read() | |
xml = re.search("(<?xml.*)';", d).groups()[0] | |
#skip actual parsing of the xml, for now. KISS | |
fs = re.findall("(FIELDS.*?)>", xml) | |
ts = [list(re.search('FIELDS CCNO="(.*?)" CRIME_DESC="(.*?)" PREM_DESC="(.*?)" FROM_DATE="(.*?)" LOCATION="(.*?)" POST="(.*?)" DISTRICT="(.*?)"', f).groups()) for f in fs] | |
for t in ts: | |
#time is given in milliseconds since unix time start; we want seconds | |
t[3] = datetime.fromtimestamp(float(t[3])/1000) | |
return [tuple(t) for t in ts] | |
#s = scrape(datetime(2009,2,10)) | |
#pp((s, len(s))) | |
#geocode the addys | |
def date_range(dt1, dt2): | |
if dt1 > dt2: | |
raise Exception("invalid date range") | |
while dt1 <= dt2: | |
yield dt1 | |
dt1 += timedelta(days=1) | |
g = Google("#XXX: google maps api key"#) | |
def geocode(addr): | |
for _, coords in g.geocode(addr + " baltimore, md", False): | |
return coords | |
conn = MySQLdb.connect(host=#host, user=#user, db=#db_name) | |
notfound = [] | |
for d in date_range(datetime(2009,3,10), datetime(2009,5,12)): | |
print d | |
for case_number, crime, premises, dt, addr, post, district in scrape(d): | |
latlong = geocode(addr) | |
if not latlong: | |
sleep(2) | |
latlong = geocode(addr) | |
if not latlong: | |
notfound.append(addr) | |
continue | |
lat, long = latlong | |
cur = conn.cursor() | |
cur.execute(""" | |
INSERT INTO crime (city_id, case_number, crime, location, address, | |
reported_on, latitude, longitude) | |
VALUES (1,%s,%s,%s,%s,%s,%s,%s)""" , ( | |
case_number, crime, premises, addr, dt, lat, long) | |
) | |
cur.close() | |
conn.commit() | |
print "not found addresses:" | |
pp(notfound) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment