Last active
September 22, 2020 05:52
-
-
Save SymbolixAU/0f65a431a53da6bdaf5d60b8ff30eba9 to your computer and use it in GitHub Desktop.
benchmarks for distance calculations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(dplyr) | |
library(sp) | |
library(rgeos) | |
library(UScensus2000tract) | |
library(geosphere) | |
library(geodist) | |
# load data and create an Origin-Destination matrix | |
data("oregon.tract") | |
# get centroids as a data.frame | |
centroids <- as.data.frame(gCentroid(oregon.tract,byid=TRUE)) | |
# Convert row names into first column | |
setDT(centroids, keep.rownames = TRUE)[] | |
# create Origin-destination matrix | |
orig <- centroids[1:754, ] | |
dest <- centroids[2:755, ] | |
odmatrix <- bind_cols(orig,dest) | |
colnames(odmatrix) <- c("origi_id", "long_orig", "lat_orig", "dest_id", "long_dest", "lat_dest") | |
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){ | |
radians <- pi/180 | |
lat_to <- lat_to * radians | |
lat_from <- lat_from * radians | |
lon_to <- lon_to * radians | |
lon_from <- lon_from * radians | |
dLat <- (lat_to - lat_from) | |
dLon <- (lon_to - lon_from) | |
a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2) | |
return(2 * atan2(sqrt(a), sqrt(1 - a)) * r) | |
} | |
library(Rcpp) | |
sourceCpp("./distance_calcs.cpp") | |
dt <- rbindlist(list(odmatrix, odmatrix, odmatrix, odmatrix, odmatrix, odmatrix)) | |
dt <- rbindlist(list(dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt, dt)) | |
dt1 <- copy(dt); dt2 <- copy(dt); dt3 <- copy(dt); dt4 <- copy(dt); dt5 <- copy(dt) | |
res <- geodist::geodist( | |
x = dt5[, .(long_orig, lat_orig)] | |
, y = dt5[, .(long_dest, lat_dest)] | |
, paired = TRUE | |
) | |
dt51 <- dt5[, .(long_orig, lat_orig)] | |
dt52 <- dt5[, .(long_dest, lat_dest)] | |
library(microbenchmark) | |
microbenchmark( | |
geodist = { | |
## dt5[, dist := geodist::geodist(x = .SD[, .(long_orig, lat_orig)], y = .SD[, .(long_dest, lat_dest)], paired = T)] | |
# dt5[ , dist := geodist::geodist(x = dt5[, .(long_orig, lat_orig)] | |
# , y = dt5[, .(long_dest, lat_dest)] | |
# , paired = TRUE)] | |
dt5[, dist := geodist::geodist( x = dt51, y = dt52, paired = T)] | |
}, | |
rcpp = { | |
dt4[, dist := rcpp_distance_haversine(lat_orig, long_orig, lat_dest, long_dest, tolerance = 10000000000.0)] | |
}, | |
dtHaversine = { | |
dt1[, dist := dt.haversine(lat_orig, long_orig, lat_dest, long_dest)] | |
} , | |
haversine = { | |
dt2[ , dist := distHaversine(matrix(c(long_orig, lat_orig), ncol = 2), | |
matrix(c(long_dest, lat_dest), ncol = 2))] | |
}, | |
geo = { | |
dt3[ , dist := distGeo(matrix(c(long_orig, lat_orig), ncol = 2), | |
matrix(c(long_dest, lat_dest), ncol = 2))] | |
}, | |
times = 5 | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Rcpp.h> | |
using namespace Rcpp; | |
double inverseHaversine(double d){ | |
return 2 * atan2(sqrt(d), sqrt(1 - d)) * 6378137.0; | |
} | |
double distanceHaversine(double latf, double lonf, double latt, double lont, | |
double tolerance){ | |
double d; | |
double dlat = latt - latf; | |
double dlon = lont - lonf; | |
d = (sin(dlat * 0.5) * sin(dlat * 0.5)) + (cos(latf) * cos(latt)) * (sin(dlon * 0.5) * sin(dlon * 0.5)); | |
if(d > 1 && d <= tolerance){ | |
d = 1; | |
} | |
return inverseHaversine(d); | |
} | |
double toRadians(double deg){ | |
return deg * 0.01745329251; // PI / 180; | |
} | |
// [[Rcpp::export]] | |
Rcpp::NumericVector rcpp_distance_haversine(Rcpp::NumericVector latFrom, Rcpp::NumericVector lonFrom, | |
Rcpp::NumericVector latTo, Rcpp::NumericVector lonTo, | |
double tolerance) { | |
int n = latFrom.size(); | |
NumericVector distance(n); | |
double latf; | |
double latt; | |
double lonf; | |
double lont; | |
double dist = 0; | |
for(int i = 0; i < n; i++){ | |
latf = toRadians(latFrom[i]); | |
lonf = toRadians(lonFrom[i]); | |
latt = toRadians(latTo[i]); | |
lont = toRadians(lonTo[i]); | |
dist = distanceHaversine(latf, lonf, latt, lont, tolerance); | |
distance[i] = dist; | |
} | |
return distance; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lons <- seq(-180, 180, by = 0.00001) | |
lats <- seq(-90, 90, by = 0.00001) | |
set.seed(20190719) | |
n <- 1e6 | |
lon1 <- sample(lons, size = n, replace = T) | |
lat1 <- sample(lats, size = n, replace = T) | |
lon2 <- sample(lons, size = n, replace = T) | |
lat2 <- sample(lats, size = n, replace = T) | |
df1 <- data.frame( | |
x = lon1 | |
, y = lat1 | |
) | |
df2 <- data.frame( | |
x = lon2 | |
, y = lat2 | |
) | |
library(microbenchmark) | |
microbenchmark( | |
rcpp = { | |
rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0) | |
}, | |
geodist = { | |
geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine") | |
}, | |
times = 10 | |
) | |
# Unit: milliseconds | |
# expr min lq mean median uq max neval | |
# rcpp 95.822642 96.429467 98.5662817 98.16482300000001 100.716090 102.229619 10 | |
# geodist 118.757472 121.395833 141.5120944 137.93193350000001 143.993625 215.095936 10 | |
res_rcpp <- rcpp_distance_haversine(lat1, lon1, lat2, lon2, 10000000000.0) | |
res_geo <- geodist::geodist(x = df1, y = df2, paired = T, measure = "haversine") | |
all.equal(res_rcpp, res_geo) | |
# [1] TRUE |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very interesting, and much indebted to you for digging into this David. The following results shed light on what's going on -- with spoiler at the outset being the comments about "pre-convert[ing] data.frame back to numeric vector, in order to call the geodist C function directly":
Created on 2019-07-24 by the reprex package (v0.3.0)
So the C function itself is indeed faster, but conversion from
data.frame
to numeric matrix/vector takes most of the C++ time. Interesting for us both - and @mdsumner - to appreciate that submitting individual vectors to an Rcpp function ought never be compared with submission of an otherwise equivalentdata.frame
of columns. Additionally interesting is that even direct submission ofmatrix
objects gives same results asdata.frame
objects, so it is theas.vector()
call that is taking most of the time. The conclusion here: Truly efficient C-level routines require direct submission of vector objects, as implemented in your Rcpp routine. More importantly for my current purposes: I'd love your input on how I might modifygeodist
to overcome these obvious shortcomings of the R-side API. Thanks loads for the constructive investigations!