-
-
Save shanealynn/033c8a3cacdba8ce03cbe116225ced31 to your computer and use it in GitHub Desktop.
""" | |
Python script for batch geocoding of addresses using the Google Geocoding API. | |
This script allows for massive lists of addresses to be geocoded for free by pausing when the | |
geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid | |
geocoding from Google, set it in the API key section. | |
Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses | |
come from a csv file with a column "Address". Adjust the code to your own requirements as needed. | |
After every 500 successul geocode operations, a temporary file with results is recorded in case of | |
script failure / loss of connection later. | |
Addresses and data are held in memory, so this script may need to be adjusted to process files line | |
by line if you are processing millions of entries. | |
Shane Lynn | |
5th November 2016 | |
""" | |
import pandas as pd | |
import requests | |
import logging | |
import time | |
# Script-wide logger. DEBUG level is used so that the per-address messages
# emitted with logger.debug() are not filtered out.
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)

# Console handler: passes every record (DEBUG and above) to the console stream.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
#------------------ CONFIGURATION -------------------------------
# Google API key. None means "send requests without a key".
# Per the original author's notes: even on the free tier (2500 queries a day)
# a key is worth having, because keyed requests are rate limited at about
# 50 / second, whereas without a key you hit a ~2 second delay every 10
# requests or so. Keys for the "Google Maps Geocoding API" are issued at
# https://console.developers.google.com/apis/
# Example: API_KEY = 'YOUR-GOOGLE-API-KEY'   (never commit a real key)
API_KEY = None

# Minutes to wait before retrying once Google reports that the query limit
# has been hit (the value is multiplied by 60 before being passed to sleep).
BACKOFF_TIME = 30

# Path of the CSV file the geocoding results are written to.
output_filename = 'data/output-2015.csv'

# Path of the CSV file the addresses are read from.
input_filename = "data/PPR-2015.csv"

# Name of the column in the input data that contains the addresses.
address_column_name = "Address"

# If True, the full JSON response from Google is stored alongside each result.
RETURN_FULL_RESULTS = False
#------------------ DATA LOADING --------------------------------
# Read the input CSV into a pandas DataFrame.
data = pd.read_csv(input_filename, encoding='utf8')

# Fail early, with a clear message, when the configured address column is absent.
if address_column_name not in data.columns:
    raise ValueError("Missing Address column in input data")

# Form the list of addresses to geocode: one entry per input row.
addresses = data[address_column_name].tolist()

# **** DEMO DATA / IRELAND SPECIFIC! ****
# The demo addresses are known to be in Ireland and the demo file has a
# "County" column, so both are appended to each address for accuracy.
# (Remove / alter this for your own dataset.) The enrichment is only applied
# when the column actually exists, so other datasets fall back to the plain
# address list built above instead of failing with a KeyError.
if 'County' in data.columns:
    addresses = (data[address_column_name] + ',' + data['County'] + ',Ireland').tolist()
#------------------ FUNCTION DEFINITIONS ------------------------
def get_google_results(address, api_key=None, return_full_response=False):
    """
    Get geocode results from the Google Maps Geocoding API.

    Note that in the case of multiple Google geocode results, this function
    returns details of the FIRST result only.

    @param address: String address, as accurate as possible.
                    For example "18 Grafton Street, Dublin, Ireland".
    @param api_key: String API key from Google, if you have one. When supplied
                    it is sent with the request as the "key" parameter; when
                    None the request is sent without a key.
    @param return_full_response: Boolean. When True, the complete decoded JSON
                    response is included in the output under "response". This
                    is useful if you'd like additional location details for
                    storage or parsing later.
    @return: dict with the keys "formatted_address", "latitude", "longitude",
             "accuracy", "google_place_id", "type", "postcode", "input_string",
             "number_of_results" and "status" (plus "response" when requested).
             The first seven are None when Google returned no match.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

    # Hand the query to requests as parameters rather than formatting it into
    # the URL by hand: this percent-encodes characters such as '#', '&', spaces
    # and non-ASCII text, which otherwise truncate or corrupt the query string.
    params = {"address": address}
    if api_key is not None:
        params["key"] = api_key

    # Ping Google for the results. The timeout stops a stalled connection from
    # hanging the whole batch; it surfaces as an exception the caller can handle.
    response = requests.get(geocode_url, params=params, timeout=30)

    # Results are in JSON format - convert to dict using requests functionality.
    results = response.json()

    # Error payloads may carry no "results" list at all; treat that as no match.
    matches = results.get('results') or []

    if not matches:
        # No results or an error: return empty details.
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None,
        }
    else:
        answer = matches[0]
        # Default the nested containers so a sparse match yields None / ""
        # fields instead of raising.
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        components = answer.get('address_components') or []
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types') or []),
            "postcode": ",".join([x['long_name'] for x in components
                                  if 'postal_code' in x.get('types', [])]),
        }

    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results

    return output
#------------------ PROCESSING LOOP -----------------------------
# Ensure, before we start, that the API key is ok/valid, and internet access is ok
test_result = get_google_results("London, England", API_KEY, RETURN_FULL_RESULTS)
if (test_result['status'] != 'OK') or (test_result['formatted_address'] != 'London, UK'):
    logger.warning("There was an error when testing the Google Geocoder.")
    # Report what Google actually answered to make the failure diagnosable.
    logger.warning("Test query returned status: {}".format(test_result['status']))
    raise ConnectionError('Problem with test results from Google Geocode - check your API key and internet connection.')

# Create a list to hold results
results = []

# Go through each address in turn
for address in addresses:
    # geocode_result stays None until this address has been answered; it is
    # reset for every address so that a failure can never re-use (and
    # re-append) the previous address's result.
    geocode_result = None
    while geocode_result is None:
        # Geocode the address with google
        try:
            candidate = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            # Give up on this address; there is no result to inspect.
            break

        # If we're over the API limit, backoff for a while and try again later.
        if candidate['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60)  # BACKOFF_TIME is in minutes
            continue

        geocode_result = candidate

    if geocode_result is None:
        # The address was skipped: nothing to record, and no progress
        # reporting / backup since the number of results did not change.
        continue

    # If we're ok with API use, save the results
    # Note that the results might be empty / non-ok - log this
    if geocode_result['status'] != 'OK':
        logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
    logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
    results.append(geocode_result)

    # Print status every 100 addresses
    if len(results) % 100 == 0:
        logger.info("Completed {} of {} address".format(len(results), len(addresses)))

    # Every 500 addresses, save progress to file (in case of a failure so you have something!)
    if len(results) % 500 == 0:
        pd.DataFrame(results).to_csv("{}_bak".format(output_filename), encoding='utf8')

# All done
logger.info("Finished geocoding all addresses")
# Write the full results to csv using the pandas library.
pd.DataFrame(results).to_csv(output_filename, encoding='utf8')
Works awesome, thank you so much!
You mentioned that, in the case of multiple Google geocode results, this function returns details of the FIRST result. What if the first result is wrong or located somewhere else? I have found some cases where Google shows three partial matches, of which the third one is correct, but the first shows the lat-long of some other country.
Is it possible to filter addresses by city or zip code so that we get the most likely correct address (i.e. taking the second or third result instead of the first)?
After geocoding about 50 addresses always getting : Hit Query Limit! Backing off for a bit.
Anyone have this problem?
Mine gets stuck after 130. Some weird throttling.
[Update] I got to know from the data logs that the string had # in it which was making it behave weirdly. So replace #, check the string on which it gets stuck and replace the culprit. It should work then.
This was so helpful. Appreciate it for the detailing.
When I try to run the script I'm getting "IOError: File testfile.csv does not exist". The csv is in the same folder as the script. I did not find any configuration regarding file paths so I only changed the file name. Can somebody explain where the csv has to be for this script to run properly?
You need to add a 'u' in front of the string on line 82 to process unicode correctly:
geocode_url = u"https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
Very helpful, thanks so much!!
Traceback (most recent call last):
File "python_batch_geocoding.py", line 135, in
raise ConnectionError('Problem with test results from Google Geocode - check your API key and internet connection.')
ConnectionError: Problem with test results from Google Geocode - check your API key and internet connection.
I have already checked the API key and the internet connection, which connects successfully, but I still get this error.
You need to add a 'u' in front of the string on line 82 to process unicode correctly:
geocode_url = u"https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
Thank you so much!
Hi!
I'm not a Python user but I'm looking for a geocoder. I tried this one because it says I can use it without an API key (API_KEY = None). But it stops at line 127 (the test):
Traceback (most recent call last):
File "Z:\OUTILS\SCRIPTS_PYTHON\Geocoder.py", line 127, in
test_result = get_google_results("London, England", API_KEY, RETURN_FULL_RESULTS)
File "Z:\OUTILS\SCRIPTS_PYTHON\Geocoder.py", line 87, in get_google_results
results = requests.get(geocode_url)
File "C:\python\ArcGIS10.6\lib\site-packages\requests\api.py", line 70, in get
return request('get', url, params=params, **kwargs)
File "C:\python\ArcGIS10.6\lib\site-packages\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\python\ArcGIS10.6\lib\site-packages\requests\sessions.py", line 475, in request
resp = self.send(prep, **send_kwargs)
File "C:\python\ArcGIS10.6\lib\site-packages\requests\sessions.py", line 596, in send
r = adapter.send(request, **kwargs)
File "C:\python\ArcGIS10.6\lib\site-packages\requests\adapters.py", line 497, in send
raise SSLError(e, request=request)
SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
What is the problem? It looks like I have to use an API key.
Thanks
After geocoding about 50 addresses always getting : Hit Query Limit! Backing off for a bit.
Anyone have this problem?