Skip to content

Instantly share code, notes, and snippets.

@postandcourier
Created August 26, 2016 14:17
Show Gist options
  • Save postandcourier/d1be971f96e0752233c2fe075d20f5c0 to your computer and use it in GitHub Desktop.
Save postandcourier/d1be971f96e0752233c2fe075d20f5c0 to your computer and use it in GitHub Desktop.
Geocoding large dataset in R
# Script to geocode many items
# Modified by J Emory Parker
# Aug 25 2016
# Based on script originally by Shane Lynn
# http://www.shanelynn.ie/massive-geocoding-with-r-and-google-maps/
#load up the ggmap library
library(ggmap)
# get the input data
# NOTE(review): `infile` only names the checkpoint file used further down;
# the CSV path itself is hard-coded here.
infile <- "input"
data <- read.csv(file = "chs-data.csv", header = TRUE, sep = ",")

# extract the addresses and fix some errors
# The address is built column-wise instead of with apply(): apply() first
# coerces the data frame through as.matrix(), which can pad numeric street
# numbers with blanks and so leak stray whitespace into the address.
# Everything from the first "/" onwards is dropped from the street name.
fixed_street <- gsub(
  pattern = "\\/.*$", replacement = "",
  as.character(data[["Street.Name"]]), ignore.case = TRUE
)
data$address <- paste(
  data[["Street.."]], fixed_street, "Charleston, SC",
  sep = " "
)

addresses <- data$address
# collapse every run of two or more whitespace characters to a single space
addresses <- gsub(pattern = "\\s{2,}", replacement = " ", addresses)
# replace a mis-encoded character sequence found in the data with a hyphen
addresses <- gsub(
  pattern = "â€\u0090", replacement = "-", addresses,
  ignore.case = TRUE
)
# rewrite " SC 526" (any letter case) as "I526"
addresses <- gsub(
  pattern = " SC 526", replacement = "I526", addresses,
  ignore.case = TRUE
)
# strip all leading whitespace (e.g. left behind by an empty street number)
addresses <- gsub(pattern = "^\\s+", replacement = "", addresses)
# remove the literal "12&" wherever it occurs
addresses <- gsub(pattern = "12&", replacement = "", addresses)
#' Geocode a single address and flatten the reply into a one-row data frame.
#'
#' Calls `geocode()` with `output = "all"` and `source = "dsk"`, then pulls
#' the fields of the first result out of the returned list.
#'
#' @param address Address string to look up.
#' @return A one-row data frame with columns `lat`, `long`, `accuracy`,
#'   `formatted_address`, `address_type` and `status`. Fields that the reply
#'   does not provide stay `NA`; `status` is `NA` when the reply carries no
#'   usable status at all.
getGeoDetails <- function(address) {
  # use the geocode function to query the server
  geo_reply <- geocode(
    address,
    output = "all", messaging = TRUE, override_limit = TRUE, source = "dsk"
  )

  # Typed NA placeholders keep the column types identical for failed and
  # successful look-ups, so the rows can be stacked with rbind() later.
  answer <- data.frame(
    lat = NA_real_,
    long = NA_real_,
    accuracy = NA_character_,
    formatted_address = NA_character_,
    address_type = NA_character_,
    status = NA_character_,
    stringsAsFactors = FALSE
  )

  # A failed request may not come back as a list with a `status` element;
  # return the all-NA row instead of erroring out of the whole batch.
  status <- if (is.list(geo_reply)) geo_reply[["status"]] else NULL
  if (length(status) != 1L || is.na(status)) {
    return(answer)
  }
  answer$status <- as.character(status)

  # return NA's if we didn't get a match
  if (status != "OK" || length(geo_reply$results) == 0L) {
    return(answer)
  }

  # else, extract what we need from the first result; each field is only
  # assigned when present, because assigning NULL would drop the column.
  first_hit <- geo_reply$results[[1]]
  location <- first_hit$geometry$location
  if (!is.null(location$lat)) {
    answer$lat <- location$lat
  }
  if (!is.null(location$lng)) {
    answer$long <- location$lng
  }
  if (length(first_hit$types) > 0) {
    answer$accuracy <- first_hit$types[[1]]
  }
  answer$address_type <- paste(first_hit$types, collapse = ",")
  if (!is.null(first_hit$formatted_address)) {
    answer$formatted_address <- first_hit$formatted_address
  }
  answer
}
#initialise a dataframe to hold the results
geocoded <- data.frame()
# find out where to start in the address list (if the script was interrupted before):
startindex <- 1
#if a temp file exists - load it up and count the rows!
tempfilename <- paste0(infile, '_temp_geocoded.rds')
if (file.exists(tempfilename)) {
  print("Found temp file - resuming from index:")
  geocoded <- readRDS(tempfilename)
  # A checkpoint written by a run that restarted on the last saved row holds
  # that row twice; keep only the first result per address index.
  if (nrow(geocoded) > 0) {
    geocoded <- geocoded[!duplicated(geocoded$index), , drop = FALSE]
  }
  # Rows 1..nrow(geocoded) are already done, so resume with the NEXT address.
  startindex <- nrow(geocoded) + 1
  print(startindex)
}

# Start the geocoding process - address by address.
n_addresses <- length(addresses)
# seq(a, b) counts backwards when a > b, so build the index range explicitly
# and leave it empty when every address has already been geocoded.
remaining <- if (startindex <= n_addresses) {
  seq(startindex, n_addresses)
} else {
  integer(0)
}
for (ii in remaining) {
  print(paste("Working on index", ii, "of", length(addresses)))
  #query the geocoder for this address
  result <- getGeoDetails(addresses[ii])
  print(result$status)
  result$index <- ii
  #append the answer to the results file.
  geocoded <- rbind(geocoded, result)
  #save temporary results as we are going along, so an interrupted run can resume
  saveRDS(geocoded, tempfilename)
}

#now we add the latitude and longitude to the main data
# Align by the stored address index rather than by position, so every input
# row receives the result that was computed for it.
result_row <- match(seq_len(nrow(data)), geocoded$index)
data$lat <- geocoded$lat[result_row]
data$long <- geocoded$long[result_row]
data$accuracy <- geocoded$accuracy[result_row]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment