# Scrape MDEQ Asbestos Notifications
import requests
import urllib
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Initial request: fetch the search page so its form fields can be replayed.
# NOTE(review): the URL literal is empty in this file, so the request cannot
# succeed as written -- fill in the search page address before running.
r = requests.get('')

# Parse out nasty form stuff
soup = BeautifulSoup(r.text, 'html.parser')
hidden_field_token = 'ctl00_BodyContent_smAjax_HiddenField'
parsed_hidden_field_key = '_TSM_CombinedScripts_'

# The hidden-field value is read from the query string of the <script> tag
# whose src mentions the hidden-field token. If several tags match, the last
# one wins.
qs = {}
for script in soup.find_all('script'):
    src = script.get('src')
    if src is not None and hidden_field_token in src:
        o = urlparse(src)
        qs = urllib.parse.parse_qs(o.query)

# Fail with a clear message instead of a NameError/KeyError further down when
# the page did not contain the expected script tag.
if parsed_hidden_field_key not in qs:
    raise RuntimeError(
        'No <script> src containing {} with a {} query parameter was found'
        .format(hidden_field_token, parsed_hidden_field_key)
    )

payload = {}
wayne_county_code = '82'
city_name = 'Detroit'
county_key = 'ctl00$BodyContent$ddlCounty'
city_key = 'ctl00$BodyContent$txtCity'
# NOTE(review): clear_key is not used anywhere in the visible code; presumably
# the clear button was meant to be dropped from the payload -- confirm.
clear_key = 'ctl00$BodyContent$_btnClear'

# Copy every named <input> on the page into the payload so that the form's
# hidden state fields are echoed back with the search.
for inp in soup.find_all('input'):
    name = inp.get('name')
    if name is None:
        # An input without a name has no form key; storing it would add a
        # bogus None key to the payload.
        continue
    value = inp.get('value')
    payload[name] = value if value is not None else ''

# Setup form object to be sent to their servers and hard code the county code
# for Wayne and the city of Detroit
payload[county_key] = wayne_county_code
payload[city_key] = city_name
payload[hidden_field_token] = qs[parsed_hidden_field_key][0]

# Send a POST request
# NOTE(review): req_url is empty in this file as well -- fill in the form's
# submission address before running.
req_url = ''
second_r = requests.post(req_url, data=payload)
parsed_data_page = BeautifulSoup(second_r.text, 'html.parser')
#scraped_data = parsed_data_page.find_all('div', attrs={'id': 'divPrint'})
import pandas as pd
import requests as r
import urllib.parse as u
# Asbestos notification records to be geocoded, loaded from the local CSV.
df = pd.read_csv('deq_asbestos_notifications.csv')

# Geocoder request template; the single {} placeholder receives the
# URL-encoded query built by geocode_address.
# NOTE(review): the template has no scheme or host in front of the
# placeholder -- confirm the geocoder endpoint that belongs here.
request_url = (
    '{}&ZIP=&Single+Line+Input=&category=&outFields=*'
    '&maxLocations=&outSR=4326&searchExtent=&location='
    '&distance=&magicKey=&f=pjson'
)
def geocode_address(addr):
    """Geocode one street address and return the first match as a Series.

    The address is URL-encoded as the ``Street`` query parameter, substituted
    into the module-level ``request_url`` template, and fetched with a GET
    request. The response body is decoded as JSON.

    Args:
        addr: Street address to look up.

    Returns:
        A ``pd.Series`` with ``Parcel``, ``Score``, ``Long`` and ``Lat`` taken
        from the first entry of the response's ``candidates`` list. When that
        list is empty, a Series with ``Parcel`` set to ``'Unknown'`` and
        ``Score`` set to ``0`` (and no ``Long``/``Lat`` entries).
    """
    query = u.urlencode({'Street': addr})
    response = r.get(request_url.format(query))
    candidates = response.json()['candidates']

    # No match at all: return the placeholder row that callers filter on.
    if len(candidates) == 0:
        return pd.Series({'Parcel': 'Unknown', 'Score': 0})

    # Greedy choice for now: only the first candidate is considered.
    best = candidates[0]
    location = best['location']
    attributes = best['attributes']
    return pd.Series({
        'Parcel': attributes['User_fld'].strip(),
        'Score': attributes['Score'],
        'Long': location['x'],
        'Lat': location['y'],
    })
# Get all unique addresses in the asbestos notification entries and geocode
# them, so each distinct address is looked up only once.
uniq_addrs = df.Address.unique()
addrs = {'Address': uniq_addrs}
addr_df = pd.DataFrame(data=addrs)

# For a quick trial run, geocode a slice such as addr_df[:20] instead.
geocode_result = addr_df.apply(lambda data_row: geocode_address(data_row['Address']), axis=1)

# Merge on addr_df, the frame that was actually geocoded. The previous
# left-hand frame, geocode_slice, is never assigned, so it raised NameError.
merged = addr_df.merge(geocode_result, left_index=True, right_index=True)

# This will find geocode results that are not perfect matches:
#unmatched_geocodes = merged.query('Score < 100.00')

# Join geocoded addresses to the original dataframe
joined_df = df.set_index('Address').join(merged.set_index('Address'))

# Score == 0 is the placeholder geocode_address returns when nothing matched.
queried_df = joined_df.query('Score > 0')
bad_addresses = joined_df.query('Score == 0')
