Last active
          September 22, 2020 05:52 
        
      - 
      
- 
        Save SymbolixAU/0f65a431a53da6bdaf5d60b8ff30eba9 to your computer and use it in GitHub Desktop. 
    benchmarks for distance calculations 
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | library(data.table) | |
| library(dplyr) | |
| library(sp) | |
| library(rgeos) | |
| library(UScensus2000tract) | |
| library(geosphere) | |
| library(geodist) | |
| # load data and create an Origin-Destination matrix | |
| data("oregon.tract") | |
| # get centroids as a data.frame | |
| centroids <- as.data.frame(gCentroid(oregon.tract,byid=TRUE)) | |
| # Convert row names into first column | |
| setDT(centroids, keep.rownames = TRUE)[] | |
| # create Origin-destination matrix | |
| orig <- centroids[1:754, ] | |
| dest <- centroids[2:755, ] | |
| odmatrix <- bind_cols(orig,dest) | |
| colnames(odmatrix) <- c("origi_id", "long_orig", "lat_orig", "dest_id", "long_dest", "lat_dest") | |
| dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){ | |
| radians <- pi/180 | |
| lat_to <- lat_to * radians | |
| lat_from <- lat_from * radians | |
| lon_to <- lon_to * radians | |
| lon_from <- lon_from * radians | |
| dLat <- (lat_to - lat_from) | |
| dLon <- (lon_to - lon_from) | |
| a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2) | |
| return(2 * atan2(sqrt(a), sqrt(1 - a)) * r) | |
| } | |
| library(Rcpp) | |
| sourceCpp("./distance_calcs.cpp") | |
| dt <- rbindlist(list(odmatrix, odmatrix, odmatrix, odmatrix, odmatrix, odmatrix)) | |
| dt <- rbindlist(list(dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt)) | |
| dt1 <- copy(dt); dt2 <- copy(dt); dt3 <- copy(dt); dt4 <- copy(dt); dt5 <- copy(dt) | |
| res <- geodist::geodist( | |
| x = dt5[, .(long_orig, lat_orig)] | |
| , y = dt5[, .(long_dest, lat_dest)] | |
| , paired = TRUE | |
| ) | |
| dt51 <- dt5[, .(long_orig, lat_orig)] | |
| dt52 <- dt5[, .(long_dest, lat_dest)] | |
| library(microbenchmark) | |
| microbenchmark( | |
| geodist = { | |
| ## dt5[, dist := geodist::geodist(x = .SD[, .(long_orig, lat_orig)], y = .SD[, .(long_dest, lat_dest)], paired = T)] | |
| # dt5[ , dist := geodist::geodist(x = dt5[, .(long_orig, lat_orig)] | |
| # , y = dt5[, .(long_dest, lat_dest)] | |
| # , paired = TRUE)] | |
| dt5[, dist := geodist::geodist( x = dt51, y = dt52, paired = T)] | |
| }, | |
| rcpp = { | |
| dt4[, dist := rcpp_distance_haversine(lat_orig, long_orig, lat_dest, long_dest, tolerance = 10000000000.0)] | |
| }, | |
| dtHaversine = { | |
| dt1[, dist := dt.haversine(lat_orig, long_orig, lat_dest, long_dest)] | |
| } , | |
| haversine = { | |
| dt2[ , dist := distHaversine(matrix(c(long_orig, lat_orig), ncol = 2), | |
| matrix(c(long_dest, lat_dest), ncol = 2))] | |
| }, | |
| geo = { | |
| dt3[ , dist := distGeo(matrix(c(long_orig, lat_orig), ncol = 2), | |
| matrix(c(long_dest, lat_dest), ncol = 2))] | |
| }, | |
| times = 5 | |
| ) | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #include <Rcpp.h> | |
| using namespace Rcpp; | |
| double inverseHaversine(double d){ | |
| return 2 * atan2(sqrt(d), sqrt(1 - d)) * 6378137.0; | |
| } | |
| double distanceHaversine(double latf, double lonf, double latt, double lont, | |
| double tolerance){ | |
| double d; | |
| double dlat = latt - latf; | |
| double dlon = lont - lonf; | |
| d = (sin(dlat * 0.5) * sin(dlat * 0.5)) + (cos(latf) * cos(latt)) * (sin(dlon * 0.5) * sin(dlon * 0.5)); | |
| if(d > 1 && d <= tolerance){ | |
| d = 1; | |
| } | |
| return inverseHaversine(d); | |
| } | |
| double toRadians(double deg){ | |
| return deg * 0.01745329251; // PI / 180; | |
| } | |
| // [[Rcpp::export]] | |
| Rcpp::NumericVector rcpp_distance_haversine(Rcpp::NumericVector latFrom, Rcpp::NumericVector lonFrom, | |
| Rcpp::NumericVector latTo, Rcpp::NumericVector lonTo, | |
| double tolerance) { | |
| int n = latFrom.size(); | |
| NumericVector distance(n); | |
| double latf; | |
| double latt; | |
| double lonf; | |
| double lont; | |
| double dist = 0; | |
| for(int i = 0; i < n; i++){ | |
| latf = toRadians(latFrom[i]); | |
| lonf = toRadians(lonFrom[i]); | |
| latt = toRadians(latTo[i]); | |
| lont = toRadians(lonTo[i]); | |
| dist = distanceHaversine(latf, lonf, latt, lont, tolerance); | |
| distance[i] = dist; | |
| } | |
| return distance; | |
| } | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | lons <- seq(-180, 180, by = 0.00001) | |
| lats <- seq(-90, 90, by = 0.00001) | |
| set.seed(20190719) | |
| n <- 1e6 | |
| lon1 <- sample(lons, size = n, replace = T) | |
| lat1 <- sample(lats, size = n, replace = T) | |
| lon2 <- sample(lons, size = n, replace = T) | |
| lat2 <- sample(lats, size = n, replace = T) | |
| df1 <- data.frame( | |
| x = lon1 | |
| , y = lat1 | |
| ) | |
| df2 <- data.frame( | |
| x = lon2 | |
| , y = lat2 | |
| ) | |
| library(microbenchmark) | |
| microbenchmark( | |
| rcpp = { | |
| rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0) | |
| }, | |
| geodist = { | |
| geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine") | |
| }, | |
| times = 10 | |
| ) | |
| # Unit: milliseconds | |
| # expr min lq mean median uq max neval | |
| # rcpp 95.822642 96.429467 98.5662817 98.16482300000001 100.716090 102.229619 10 | |
| # geodist 118.757472 121.395833 141.5120944 137.93193350000001 143.993625 215.095936 10 | |
| res_rcpp <- rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0) | |
| res_geo <- geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine") | |
| all.equal(res_rcpp, res_geo) | |
| # [1] TRUE | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Very interesting, and much indebted to you for digging into this David. The following results shed light on what's going on -- with spoiler at the outset being the comments about "pre-convert[ing] data.frame back to numeric vector, in order to call the geodist C function directly":
Created on 2019-07-24 by the reprex package (v0.3.0)
So the C function itself is indeed faster, but conversion from
data.frameto numeric matrix/vector takes most of the C++ time. Interesting for us both - and @mdsumner - to appreciate that submitting individual vectors to an Rcpp function ought never be compared with submission of an otherwise equivalentdata.frameof columns. Additionally interesting is that even direct submission ofmatrixobjects gives same results asdata.frameobjects, so it is theas.vector()call that is taking most of the time. The conclusion here: Truly efficient C-level routines require direct submission of vector objects, as implemented in your Rcpp routine. More importantly for my current purposes: I'd love your input on how I might modifygeodistto overcome these obvious shortcomings of the R-side API. Thanks loads for the constructive investigations!