Skip to content

Instantly share code, notes, and snippets.

@mdobson
Last active June 22, 2018 18:34
Show Gist options
  • Save mdobson/240a2e311d17c99fe4aeff78f50a08c2 to your computer and use it in GitHub Desktop.
Save mdobson/240a2e311d17c99fe4aeff78f50a08c2 to your computer and use it in GitHub Desktop.
Scrape MDEQ Asbestos Notifications
import requests
import urllib
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Scrape the MDEQ asbestos-notification search page: fetch the form, copy its
# ASP.NET state fields into a payload, fill in the county/city search
# criteria, and POST the form back to obtain the result page.
req_url = 'http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx'

# Initial request: fetch the search page so its form fields can be replayed.
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(req_url, timeout=30)

# Parse out nasty asp.net form stuff
soup = BeautifulSoup(r.text, 'html.parser')

hidden_field_token = 'ctl00_BodyContent_smAjax_HiddenField'
parsed_hidden_field_key = '_TSM_CombinedScripts_'

# The value for the hidden field is carried in the query string of the
# <script> tag whose src mentions the hidden field token. If several tags
# match, the last one wins.
qs = None
for script in soup.find_all('script'):
    src = script.get('src')
    if src is not None and hidden_field_token in src:
        o = urlparse(src)
        qs = urllib.parse.parse_qs(o.query)

# Fail with a clear message instead of a NameError/KeyError further down when
# the page layout is not what this script expects.
if qs is None or parsed_hidden_field_key not in qs:
    raise RuntimeError(
        'Could not find a script tag containing %r with a %r query parameter'
        % (hidden_field_token, parsed_hidden_field_key)
    )

payload = {}
wayne_county_code = '82'
city_name = 'Detroit'
county_key = 'ctl00$BodyContent$ddlCounty'
city_key = 'ctl00$BodyContent$txtCity'
clear_key = 'ctl00$BodyContent$_btnClear'

# Copy every named <input> of the page into the payload.
for inp in soup.find_all('input'):
    name = inp.get('name')
    if name is None:
        # An input without a name cannot be a form field; skipping it avoids
        # a None key in the payload.
        continue
    value = inp.get('value')
    payload[name] = value if value is not None else ''

# Setup form object to be sent to their servers and hard code the county code
# for wayne and the city of detroit
payload[county_key] = wayne_county_code
payload[city_key] = city_name
payload[hidden_field_token] = qs[parsed_hidden_field_key][0]
# The clear button must not be part of the submission; tolerate its absence.
payload.pop(clear_key, None)

# send a POST request
second_r = requests.post(req_url, data=payload, timeout=30)
parsed_data_page = BeautifulSoup(second_r.text, 'html.parser')
print(parsed_data_page)
#scraped_data = parsed_data_page.find_all('div', attrs={'id': 'divPrint'})
import pandas as pd
import requests as r
import urllib.parse as u
# URL template for the geocoder's findAddressCandidates endpoint. The '{}'
# placeholder is filled with an already url-encoded query fragment via
# str.format by the code that issues the request.
request_url = 'https://gis.detroitmi.gov/arcgis/rest/services/DoIT/AddressPointGeocoder/GeocodeServer/findAddressCandidates?{}&ZIP=&Single+Line+Input=&category=&outFields=*&maxLocations=&outSR=4326&searchExtent=&location=&distance=&magicKey=&f=pjson'

# Asbestos notification records loaded from the CSV file in the working
# directory.
df = pd.read_csv('deq_asbestos_notifications.csv')
def geocode_address(addr):
    """Geocode one street address and return the best match as a Series.

    The address is url-encoded as the ``Street`` parameter, substituted into
    ``request_url`` and fetched with a GET request; the JSON response's
    ``candidates`` list is then inspected.

    Returns a ``pd.Series`` with ``Parcel``, ``Score``, ``Long`` and ``Lat``
    taken from the first candidate, or a Series with ``Parcel`` set to
    ``'Unknown'`` and ``Score`` set to ``0`` when there are no candidates.
    """
    query_string = u.urlencode({'Street': addr})
    response = r.get(request_url.format(query_string))
    candidates = response.json()['candidates']

    # Nothing matched: report a placeholder row with a zero score.
    if len(candidates) == 0:
        return pd.Series({'Parcel': 'Unknown', 'Score': 0})

    # Greedy choice: only the first candidate in the list is considered.
    best = candidates[0]
    location = best['location']
    attributes = best['attributes']
    return pd.Series({
        'Parcel': attributes['User_fld'].strip(),
        'Score': attributes['Score'],
        'Long': location['x'],
        'Lat': location['y'],
    })
# Get all unique addresses in the entries for asbestos notifications and
# geocode each one exactly once.
uniq_addrs = df.Address.unique()
addrs = {'Address': uniq_addrs}
addr_df = pd.DataFrame(data=addrs)

# To try the pipeline on a small sample, slice addr_df here (e.g.
# addr_df[:20]) before geocoding.
geocode_result = addr_df.apply(lambda data_row: geocode_address(data_row['Address']), axis=1)

# Attach the geocode columns to their addresses by row position. The merge
# must use the same frame that was geocoded (addr_df); the name
# 'geocode_slice' used previously is never defined and raised a NameError.
merged = addr_df.merge(geocode_result, left_index=True, right_index=True)

# This will find not perfectly matched geocode results
#unmatched_geocodes = merged.query('Score < 100.00')

# Join geocoded addresses to the original dataframe
joined_df = df.set_index('Address').join(merged.set_index('Address'))

# Rows with a positive score were geocoded; a score of exactly 0 marks
# addresses for which the geocoder returned no candidate.
queried_df = joined_df.query('Score > 0')
bad_addresses = joined_df.query('Score == 0')
queried_df.to_csv('deq_asbestos_notifications_with_geocodes.csv')
bad_addresses.to_csv('deq_asbestos_notifications_anamalous_addresses.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment