Skip to content

Instantly share code, notes, and snippets.

@havran
Created July 19, 2016 11:45
Show Gist options
  • Save havran/add179970a2e57ccfe53c714ea582087 to your computer and use it in GitHub Desktop.
Save havran/add179970a2e57ccfe53c714ea582087 to your computer and use it in GitHub Desktop.
# Geocoding script for large list of addresses.
# Shane Lynn 10/10/2013
#load up the ggmap library
library(ggmap)
# Load the source table; the "Address" column of this CSV is what gets geocoded.
infile <- "input"
data <- read.csv(
  file = paste0('c:\\Temp\\R scripts\\', infile, '.csv'),
  fileEncoding = "UTF-8"
)
# Build one query string per row: ", Slovakia" is appended to make matches less
# ambiguous (change or drop it if the addresses already name a country), the
# text is converted to UTF-8, and every run of whitespace is replaced by "+".
addresses <- paste0(data$Address, ", Slovakia")
addresses <- gsub(pattern = '\\s+', replacement = '+', x = enc2utf8(addresses))
# Geocode a single address and flatten the server reply into one table row.
#
# Args:
#   address: one address string, passed unchanged to geocode().
#
# Returns:
#   A one-row data.frame with the columns lat, long, accuracy,
#   formatted_address, address_type and status. All columns except status stay
#   NA unless the reply status is "OK". status is NA when the reply carries no
#   usable status field (for example an empty reply after a failed request).
#
# Side effects:
#   While the reply status is "OVER_QUERY_LIMIT" the function prints a notice,
#   sleeps for one hour and retries, so a call can block for a long time.
getGeoDetails <- function(address) {
  # Ask for the full reply (output = "all") so status and types are available.
  query_geocoder <- function() {
    geocode(address, output = "all", messaging = TRUE, override_limit = TRUE)
  }
  # Normalise the status to a character scalar. A missing status must not reach
  # `answer$status <- NULL` (which would delete the column) or a zero-length
  # comparison (which would raise "argument is of length zero").
  reply_status <- function(reply) {
    status <- if (is.list(reply)) reply$status else NULL
    if (length(status) != 1 || is.na(status)) {
      return(NA_character_)
    }
    as.character(status)
  }
  # First element of x, or NA when x is NULL/empty, so that assigning it to a
  # data.frame column never drops that column.
  first_or_na <- function(x) {
    if (length(x) > 0) x[[1]] else NA
  }

  answer <- data.frame(
    lat = NA, long = NA, accuracy = NA,
    formatted_address = NA, address_type = NA, status = NA
  )

  geo_reply <- query_geocoder()
  status <- reply_status(geo_reply)
  # If we are over the query limit, pause for an hour and try again.
  while (identical(status, "OVER_QUERY_LIMIT")) {
    print("OVER QUERY LIMIT - Pausing for 1 hour at:")
    print(as.character(Sys.time()))
    Sys.sleep(60 * 60)
    geo_reply <- query_geocoder()
    status <- reply_status(geo_reply)
  }
  answer$status <- status

  # No match (or no usable reply): return the row with NA details.
  if (!identical(status, "OK") || length(geo_reply$results) == 0) {
    return(answer)
  }

  # Only the first (best) result of the reply is used.
  best <- geo_reply$results[[1]]
  answer$lat <- first_or_na(best$geometry$location$lat)
  answer$long <- first_or_na(best$geometry$location$lng)
  # accuracy is the first listed result type; address_type lists all of them.
  answer$accuracy <- first_or_na(best$types)
  answer$address_type <- paste(best$types, collapse = ',')
  answer$formatted_address <- first_or_na(best$formatted_address)
  answer
}
# Results table; one row is appended per geocoded address.
geocoded <- data.frame()
# Index of the first address that still has to be geocoded.
startindex <- 1L
# Progress is checkpointed to this file (relative to the working directory)
# after every address, so an interrupted run can continue where it stopped.
tempfilename <- paste0(infile, '_temp_geocoded.rds')
if (file.exists(tempfilename)) {
  print("Found temp file - resuming from index:")
  geocoded <- readRDS(tempfilename)
  # A checkpoint may hold the same index twice if a previous run restarted on
  # an already-saved row; keep only the first copy of each index.
  geocoded <- geocoded[!duplicated(geocoded$index), , drop = FALSE]
  # Rows 1..nrow(geocoded) are already done, so continue with the next one.
  # (Starting at nrow(geocoded) would geocode the last saved row again and
  # append a duplicate.)
  startindex <- nrow(geocoded) + 1L
  print(startindex)
}
# Indices still to process. Built with seq_len() so that it is empty when
# nothing is left: seq(startindex, length(addresses)) would count backwards
# when startindex is past the end or the address list is empty.
pending <- seq_len(max(0L, length(addresses) - startindex + 1L)) +
  (startindex - 1L)
# Geocode address by address. getGeoDetails() pauses by itself when the query
# limit is hit.
for (ii in pending) {
  print(paste("Working on index", ii, "of", length(addresses)))
  result <- getGeoDetails(addresses[ii])
  print(result$status)
  result$index <- ii
  # Append the row and checkpoint immediately so at most one query is lost on
  # interruption.
  geocoded <- rbind(geocoded, result)
  saveRDS(geocoded, tempfilename)
}
# Attach the geocoding results to the input table. Rows are matched by
# position, so both tables must have the same number of rows; `$<-` would
# otherwise silently recycle a shorter result vector.
if (nrow(geocoded) != nrow(data)) {
  stop(
    "geocoded has ", nrow(geocoded), " rows but data has ", nrow(data),
    "; cannot attach results row by row",
    call. = FALSE
  )
}
data$lat <- geocoded$lat
# Longitude comes from the long column (it was previously filled with lat).
data$long <- geocoded$long
data$accuracy <- geocoded$accuracy
data$formatted_address <- geocoded$formatted_address
# Finally write the enriched table next to the input, as RDS and as UTF-8 CSV.
saveRDS(data, paste0("c:\\Temp\\R scripts\\", infile, "_geocoded.rds"))
write.table(
  data,
  file = paste0("c:\\Temp\\R scripts\\", infile, "_geocoded.csv"),
  sep = ",",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment