Created
August 26, 2016 14:17
-
-
Save postandcourier/d1be971f96e0752233c2fe075d20f5c0 to your computer and use it in GitHub Desktop.
Geocoding large dataset in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to geocode many items
# Modified by J Emory Parker
# Aug 25 2016
# Based on script originally by Shane Lynn
# http://www.shanelynn.ie/massive-geocoding-with-r-and-google-maps/
# Attach ggmap, which supplies the geocode() function used further down.
library(ggmap)

# Get the input data.
# `infile` is only a prefix for the checkpoint file name built later in this
# script; the table itself is read from "chs-data.csv".
infile <- "input"
# Spell out `header` in full instead of relying on partial argument matching.
data <- read.csv(file = "chs-data.csv", header = TRUE, sep = ",")
# Build one address string per row and fix some known data errors.
#
# The street name keeps only the text before the first "/" and is joined with
# the street number and the fixed city/state suffix. This is done with
# vectorised calls rather than apply(): apply() first coerces the data frame
# to a character matrix, which can pad numeric columns with spaces.
street_name <- sub("\\/.*$", "", data[["Street.Name"]])
data$address <- paste(data[["Street.."]], street_name, "Charleston, SC",
                      sep = " ")

addresses <- data$address
# Collapse every run of whitespace to a single space ("\\s\\s" only handled
# pairs, so three spaces in a row still left two behind).
addresses <- gsub("\\s+", " ", addresses)
# Replace a mis-encoded hyphen sequence with a plain "-".
addresses <- gsub("â€\u0090", "-", addresses, ignore.case = TRUE)
addresses <- gsub(" SC 526", "I526", addresses, ignore.case = TRUE)
# Drop all leading whitespace, not just the first character.
addresses <- gsub("^\\s+", "", addresses)
addresses <- gsub("12&", "", addresses, ignore.case = TRUE)
# Geocode one address and flatten the server reply into a one-row data frame.
#
# Args:
#   address: the address to look up; passed unchanged to geocode().
#
# Returns:
#   A one-row data.frame with the columns lat, long, accuracy,
#   formatted_address, address_type and status. Every column is always
#   present; fields the reply does not provide stay NA, so the rows can be
#   stacked with rbind() regardless of how the lookup went.
getGeoDetails <- function(address) {
  # Query the geocoder via ggmap, using the "dsk" source and asking for the
  # full reply rather than just coordinates.
  geo_reply <- geocode(address, output = "all", messaging = TRUE,
                       override_limit = TRUE, source = "dsk")

  # Template row: all NA until the reply proves otherwise.
  answer <- data.frame(lat = NA, long = NA, accuracy = NA,
                       formatted_address = NA, address_type = NA, status = NA)

  # A reply without a usable status (not a list, or no status element) must
  # not wipe out the status column or crash the comparison below.
  status <- if (is.list(geo_reply)) geo_reply$status else NULL
  if (!is.character(status) || length(status) != 1 || is.na(status)) {
    return(answer)
  }
  answer$status <- status

  # Return NAs if we didn't get a match.
  if (status != "OK" || length(geo_reply$results) == 0) {
    return(answer)
  }

  # Otherwise extract what we need from the first result. Assigning NULL to
  # a data frame column would delete it, so missing fields are skipped.
  first <- geo_reply$results[[1]]
  location <- first$geometry$location
  if (!is.null(location$lat)) {
    answer$lat <- location$lat
  }
  if (!is.null(location$lng)) {
    answer$long <- location$lng
  }
  if (length(first$types) > 0) {
    answer$accuracy <- first$types[[1]]
  }
  answer$address_type <- paste(first$types, collapse = ",")
  if (!is.null(first$formatted_address)) {
    answer$formatted_address <- first$formatted_address
  }
  answer
}
# Accumulates one result row per geocoded address.
geocoded <- data.frame()

# Work out where to start in the address list. If a checkpoint file from an
# earlier, interrupted run exists, load it and continue after its last row.
startindex <- 1
tempfilename <- paste0(infile, "_temp_geocoded.rds")
if (file.exists(tempfilename)) {
  print("Found temp file - resuming from index:")
  geocoded <- readRDS(tempfilename)
  # Rows 1..nrow(geocoded) are already done; restarting at nrow(geocoded)
  # would geocode that row again and append a duplicate, leaving `geocoded`
  # with more rows than there are addresses.
  startindex <- nrow(geocoded) + 1
  print(startindex)
}

# Indices still to process. Built from seq_len() so the set is empty when
# nothing is left; seq(a, b) with a > b would instead count downwards.
remaining <- seq_len(length(addresses))
remaining <- remaining[remaining >= startindex]

# Geocode address by address.
for (ii in remaining) {
  print(paste("Working on index", ii, "of", length(addresses)))
  result <- getGeoDetails(addresses[ii])
  print(result$status)
  result$index <- ii
  # Append the answer to the results collected so far.
  geocoded <- rbind(geocoded, result)
  # Save a checkpoint after every address so an interrupted run can resume.
  saveRDS(geocoded, tempfilename)
}
# Copy the geocoding output columns onto the main data table.
# Rows are matched by position.
data[["lat"]] <- geocoded[["lat"]]
data[["long"]] <- geocoded[["long"]]
data[["accuracy"]] <- geocoded[["accuracy"]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment