Skip to content

Instantly share code, notes, and snippets.

@SymbolixAU
Last active September 22, 2020 05:52
Show Gist options
  • Save SymbolixAU/0f65a431a53da6bdaf5d60b8ff30eba9 to your computer and use it in GitHub Desktop.
Save SymbolixAU/0f65a431a53da6bdaf5d60b8ff30eba9 to your computer and use it in GitHub Desktop.
benchmarks for distance calculations
library(data.table)
library(dplyr)
library(sp)
library(rgeos)
library(UScensus2000tract)
library(geosphere)
library(geodist)
# load data and create an Origin-Destination matrix
data("oregon.tract")
# get centroids as a data.frame
centroids <- as.data.frame(gCentroid(oregon.tract,byid=TRUE))
# Convert row names into first column
setDT(centroids, keep.rownames = TRUE)[]
# create Origin-destination matrix
orig <- centroids[1:754, ]
dest <- centroids[2:755, ]
odmatrix <- bind_cols(orig,dest)
colnames(odmatrix) <- c("origi_id", "long_orig", "lat_orig", "dest_id", "long_dest", "lat_dest")
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){
radians <- pi/180
lat_to <- lat_to * radians
lat_from <- lat_from * radians
lon_to <- lon_to * radians
lon_from <- lon_from * radians
dLat <- (lat_to - lat_from)
dLon <- (lon_to - lon_from)
a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2)
return(2 * atan2(sqrt(a), sqrt(1 - a)) * r)
}
library(Rcpp)
sourceCpp("./distance_calcs.cpp")
dt <- rbindlist(list(odmatrix, odmatrix, odmatrix, odmatrix, odmatrix, odmatrix))
dt <- rbindlist(list(dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt))
dt1 <- copy(dt); dt2 <- copy(dt); dt3 <- copy(dt); dt4 <- copy(dt); dt5 <- copy(dt)
res <- geodist::geodist(
x = dt5[, .(long_orig, lat_orig)]
, y = dt5[, .(long_dest, lat_dest)]
, paired = TRUE
)
dt51 <- dt5[, .(long_orig, lat_orig)]
dt52 <- dt5[, .(long_dest, lat_dest)]
library(microbenchmark)
microbenchmark(
geodist = {
## dt5[, dist := geodist::geodist(x = .SD[, .(long_orig, lat_orig)], y = .SD[, .(long_dest, lat_dest)], paired = T)]
# dt5[ , dist := geodist::geodist(x = dt5[, .(long_orig, lat_orig)]
# , y = dt5[, .(long_dest, lat_dest)]
# , paired = TRUE)]
dt5[, dist := geodist::geodist( x = dt51, y = dt52, paired = T)]
},
rcpp = {
dt4[, dist := rcpp_distance_haversine(lat_orig, long_orig, lat_dest, long_dest, tolerance = 10000000000.0)]
},
dtHaversine = {
dt1[, dist := dt.haversine(lat_orig, long_orig, lat_dest, long_dest)]
} ,
haversine = {
dt2[ , dist := distHaversine(matrix(c(long_orig, lat_orig), ncol = 2),
matrix(c(long_dest, lat_dest), ncol = 2))]
},
geo = {
dt3[ , dist := distGeo(matrix(c(long_orig, lat_orig), ncol = 2),
matrix(c(long_dest, lat_dest), ncol = 2))]
},
times = 5
)
#include <Rcpp.h>
using namespace Rcpp;
double inverseHaversine(double d){
return 2 * atan2(sqrt(d), sqrt(1 - d)) * 6378137.0;
}
double distanceHaversine(double latf, double lonf, double latt, double lont,
double tolerance){
double d;
double dlat = latt - latf;
double dlon = lont - lonf;
d = (sin(dlat * 0.5) * sin(dlat * 0.5)) + (cos(latf) * cos(latt)) * (sin(dlon * 0.5) * sin(dlon * 0.5));
if(d > 1 && d <= tolerance){
d = 1;
}
return inverseHaversine(d);
}
double toRadians(double deg){
return deg * 0.01745329251; // PI / 180;
}
// [[Rcpp::export]]
Rcpp::NumericVector rcpp_distance_haversine(Rcpp::NumericVector latFrom, Rcpp::NumericVector lonFrom,
Rcpp::NumericVector latTo, Rcpp::NumericVector lonTo,
double tolerance) {
int n = latFrom.size();
NumericVector distance(n);
double latf;
double latt;
double lonf;
double lont;
double dist = 0;
for(int i = 0; i < n; i++){
latf = toRadians(latFrom[i]);
lonf = toRadians(lonFrom[i]);
latt = toRadians(latTo[i]);
lont = toRadians(lonTo[i]);
dist = distanceHaversine(latf, lonf, latt, lont, tolerance);
distance[i] = dist;
}
return distance;
}
lons <- seq(-180, 180, by = 0.00001)
lats <- seq(-90, 90, by = 0.00001)
set.seed(20190719)
n <- 1e6
lon1 <- sample(lons, size = n, replace = T)
lat1 <- sample(lats, size = n, replace = T)
lon2 <- sample(lons, size = n, replace = T)
lat2 <- sample(lats, size = n, replace = T)
df1 <- data.frame(
x = lon1
, y = lat1
)
df2 <- data.frame(
x = lon2
, y = lat2
)
library(microbenchmark)
microbenchmark(
rcpp = {
rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0)
},
geodist = {
geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine")
},
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# rcpp 95.822642 96.429467 98.5662817 98.16482300000001 100.716090 102.229619 10
# geodist 118.757472 121.395833 141.5120944 137.93193350000001 143.993625 215.095936 10
res_rcpp <- rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0)
res_geo <- geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine")
all.equal(res_rcpp, res_geo)
# [1] TRUE
@mpadge
Copy link

mpadge commented Jul 24, 2019

Very interesting, and much indebted to you for digging into this David. The following results shed light on what's going on -- with spoiler at the outset being the comments about "pre-convert[ing] data.frame back to numeric vector, in order to call the geodist C function directly":

lons <- seq(-180, 180, by = 0.00001)
lats <- seq(-90, 90, by = 0.00001)

set.seed(20190719)
n <- 1e6
lon1 <- sample(lons, size = n, replace = T)
lat1 <- sample(lats, size = n, replace = T)

lon2 <- sample(lons, size = n, replace = T)
lat2 <- sample(lats, size = n, replace = T)

df1 <- data.frame(
  x = lon1
  , y = lat1
)
df2 <- data.frame(
  x = lon2
  , y = lat2
)
# pre-convert data.frame back to numeric vector, in order to call the geodist
# C function directly
vf1 <- as.vector (as.matrix (df1))
vf2 <- as.vector (as.matrix (df2))

library(microbenchmark)
devtools::load_all ("[... geodist dir ...]", export_all = TRUE)
#> Loading geodist
Rcpp::sourceCpp("distance_calcs.cpp")

microbenchmark(
    rcpp = {
        rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0)
    },
    geodist_Cpp = {
        geodist(x = df1, y = df2, paired = T, measure = "haversine")
    },
    geodist_C = {
        .Call ("R_haversine_paired", vf1, vf2)
    },
    times = 10
)
#> Unit: milliseconds
#>         expr      min       lq      mean   median        uq       max
#>         rcpp 95.53560 95.67128  96.87176 96.05306  98.62796 100.11463
#>  geodist_Cpp 91.73973 92.68199 100.45013 98.84186 103.79662 124.46476
#>    geodist_C 77.91810 77.95908  78.35440 78.09386  78.30085  80.42119

Created on 2019-07-24 by the reprex package (v0.3.0)

So the C function itself is indeed faster, but conversion from data.frame to numeric matrix/vector takes most of the C++ time. Interesting for us both - and @mdsumner - to appreciate that submitting individual vectors to an Rcpp function ought never be compared with submission of an otherwise equivalent data.frame of columns. Additionally interesting is that even direct submission of matrix objects gives same results as data.frame objects, so it is the as.vector() call that is taking most of the time. The conclusion here: Truly efficient C-level routines require direct submission of vector objects, as implemented in your Rcpp routine. More importantly for my current purposes: I'd love your input on how I might modify geodist to overcome these obvious shortcomings of the R-side API. Thanks loads for the constructive investigations!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment