-
-
Save mdobson/240a2e311d17c99fe4aeff78f50a08c2 to your computer and use it in GitHub Desktop.
Scrape MDEQ Asbestos Notifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape MDEQ asbestos notifications for the city of Detroit (Wayne County).

The search page is an ASP.NET WebForms page: the search is submitted by
POSTing back every <input> found on the initial page together with the
script-manager hidden field, whose value is only exposed in the query
string of one of the page's <script src="..."> URLs.
"""
import urllib.parse
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# The same URL serves the search form (GET) and accepts the postback (POST).
req_url = 'http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx'
# Seconds to wait on each request before giving up instead of hanging forever.
request_timeout = 30

# Name of the hidden form field; it also appears inside the src of the
# combined-scripts <script> tag we need to locate.
hidden_field_token = 'ctl00_BodyContent_smAjax_HiddenField'
# Query-string parameter of that script URL that carries the field's value.
parsed_hidden_field_key = '_TSM_CombinedScripts_'

# Search criteria are hard coded: county code 82 (Wayne) and the city of Detroit.
wayne_county_code = '82'
city_name = 'Detroit'
county_key = 'ctl00$BodyContent$ddlCounty'
city_key = 'ctl00$BodyContent$txtCity'
clear_key = 'ctl00$BodyContent$_btnClear'

# One session for both requests so any cookies set by the initial GET are
# sent back with the POST.
with requests.Session() as session:
    # Initial request
    r = session.get(req_url, timeout=request_timeout)
    r.raise_for_status()

    # Parse out nasty asp.net form stuff
    soup = BeautifulSoup(r.text, 'html.parser')

    # Find the script whose URL mentions the hidden field and keep its parsed
    # query string. As before, the last matching script wins.
    qs = None
    for script in soup.find_all('script'):
        src = script.get('src')
        if src is not None and hidden_field_token in src:
            o = urlparse(src)
            qs = urllib.parse.parse_qs(o.query)

    # Fail with a clear message rather than a NameError/KeyError further down
    # when the page layout is not what this script expects.
    if qs is None or parsed_hidden_field_key not in qs:
        raise RuntimeError(
            'Could not find the {} value in any <script> src on {}'.format(
                parsed_hidden_field_key, req_url))

    # Echo every named form input back to the server; a missing value
    # attribute is sent as an empty string. Inputs without a name are not
    # part of a form submission, so they are skipped.
    payload = {}
    for inp in soup.find_all('input'):
        name = inp.get('name')
        if name is None:
            continue
        value = inp.get('value')
        payload[name] = '' if value is None else value

    # Setup form object to be sent to their servers and hard code the county
    # code for wayne and the city of detroit
    payload[county_key] = wayne_county_code
    payload[city_key] = city_name
    payload[hidden_field_token] = qs[parsed_hidden_field_key][0]
    # Do not submit the "clear" button along with the search; tolerate the
    # button being absent from the page.
    payload.pop(clear_key, None)

    # send a POST request
    second_r = session.post(req_url, data=payload, timeout=request_timeout)
    second_r.raise_for_status()

parsed_data_page = BeautifulSoup(second_r.text, 'html.parser')
print(parsed_data_page)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests as r | |
import urllib.parse as u | |
# Asbestos notification records to be geocoded; read from the working directory.
df = pd.read_csv('deq_asbestos_notifications.csv')

# Geocoder endpoint template. The single `{}` placeholder is filled with an
# already URL-encoded query fragment via str.format; the remaining parameters
# are fixed (outSR=4326, f=pjson, everything else left blank). The literal is
# split into adjacent fragments purely for readability.
request_url = (
    'https://gis.detroitmi.gov/arcgis/rest/services/DoIT/AddressPointGeocoder/'
    'GeocodeServer/findAddressCandidates?{}'
    '&ZIP=&Single+Line+Input=&category=&outFields=*&maxLocations='
    '&outSR=4326&searchExtent=&location=&distance=&magicKey=&f=pjson'
)
def geocode_address(addr):
    """Geocode one street address and return the top candidate as a Series.

    Args:
        addr: Street address text; it is URL-encoded and sent as the
            ``Street`` query parameter of ``request_url``.

    Returns:
        pandas.Series with the keys ``Parcel``, ``Score``, ``Long`` and
        ``Lat``. When the response holds no candidates, ``Parcel`` is
        ``'Unknown'``, ``Score`` is ``0`` and ``Long``/``Lat`` are NaN, so
        every result has the same set of keys.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if the service does not answer within 30 seconds.
    """
    encoded_qs = u.urlencode({'Street': addr})
    res = r.get(request_url.format(encoded_qs), timeout=30)
    # Surface HTTP failures instead of silently recording the address as
    # unmatched.
    res.raise_for_status()
    data_dict = res.json()

    # A body without a 'candidates' key is treated the same as an empty
    # candidate list rather than raising KeyError.
    data_dict_candidates = data_dict.get('candidates') or []
    if not data_dict_candidates:
        return pd.Series({
            'Parcel': 'Unknown',
            'Score': 0,
            'Long': float('nan'),
            'Lat': float('nan'),
        })

    # take the top candidate for now for the sake of greed
    first_candidate = data_dict_candidates[0]
    first_candidate_location = first_candidate['location']
    first_candidate_attributes = first_candidate['attributes']
    return pd.Series({
        'Parcel': first_candidate_attributes['User_fld'].strip(),
        'Score': first_candidate_attributes['Score'],
        'Long': first_candidate_location['x'],
        'Lat': first_candidate_location['y'],
    })
# Get all unique addresses in the entries for asbestos notifications and
# geocode each one once, so repeated addresses cost a single lookup.
uniq_addrs = df.Address.unique()
addrs = {'Address': uniq_addrs}
addr_df = pd.DataFrame(data=addrs)

# For a quick trial run, geocode a slice such as addr_df[:20] instead of the
# full frame.
geocode_result = addr_df.apply(lambda data_row: geocode_address(data_row['Address']), axis=1)

# geocode_result carries addr_df's index, so the two are aligned on it.
# (This previously merged against `geocode_slice`, a name that only exists in
# a commented-out line, which raised NameError.)
merged = addr_df.merge(geocode_result, left_index=True, right_index=True)

# To inspect imperfect matches: merged.query('Score < 100.00')

# Join geocoded addresses to the original dataframe
joined_df = df.set_index('Address').join(merged.set_index('Address'))

# Score > 0 means a candidate was found; Score == 0 is the "no candidate"
# marker written by geocode_address.
queried_df = joined_df.query('Score > 0')
bad_addresses = joined_df.query('Score == 0')

queried_df.to_csv('deq_asbestos_notifications_with_geocodes.csv')
bad_addresses.to_csv('deq_asbestos_notifications_anamalous_addresses.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment