Last active
December 31, 2015 00:49
-
-
Save arunsrinivasan/7909609 to your computer and use it in GitHub Desktop.
Comparison of pandas with data.table (along with base:::merge and plyr:::join)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(data.table) | |
# tested on 1.8.10 AND 1.8.11, results don't differ much at all. | |
require(plyr) | |
# 1_8 version | |
set.seed(1000) # for reproducibility | |
N <- 1e4 | |
foo <- function() paste(sample(letters, 10), collapse="") | |
indices <- replicate(N, foo()) | |
indices2 <- replicate(N, foo()) | |
left <- data.frame(key=rep(indices[1:8000], 10), key2=rep(indices2[1:8000], 10), value=rnorm(80000)) | |
right <- data.frame(key=indices[2001:10000], key2=indices2[2001:10000], value2=rnorm(8000)) | |
right2 <- data.frame(key=rep(right$key, 2), key2=rep(right$key2, 2), value2=rnorm(16000)) | |
left.dt <- data.table(left) | |
system.time(setkey(left.dt, key, key2)) | |
# user system elapsed | |
# 0.020 0.001 0.021 | |
right.dt <- data.table(right) | |
system.time(setkey(right.dt, key, key2)) | |
# user system elapsed | |
# 0.004 0.000 0.004 | |
right2.dt <- data.table(right2) | |
system.time(setkey(right2.dt, key, key2)) | |
# user system elapsed | |
# 0.007 0.000 0.008 | |
## Benchmarking function | |
## --------------------- | |
timeit <- function(func, niter=10) { | |
timing = rep(NA, niter) | |
for (i in 1:niter) { | |
gc() | |
timing[i] <- system.time(func())[3] | |
} | |
mean(timing) | |
} | |
## base:::merge | |
## ------------ | |
left.join <- function(sort=FALSE) result <- base::merge(left, right, all.x=TRUE, sort=sort) | |
right.join <- function(sort=FALSE) result <- base::merge(left, right, all.y=TRUE, sort=sort) | |
outer.join <- function(sort=FALSE) result <- base::merge(left, right, all=TRUE, sort=sort) | |
inner.join <- function(sort=FALSE) result <- base::merge(left, right, all=FALSE, sort=sort) | |
## data.table merge/join | |
## --------------------- | |
left.join.dt <- function(sort=FALSE) result <- right.dt[left.dt] | |
right.join.dt <- function(sort=FALSE) result <- left.dt[right.dt] | |
outer.join.dt <- function(sort=FALSE) result <- merge(left.dt, right.dt, all=TRUE, sort=sort) | |
inner.join.dt <- function(sort=FALSE) result <- merge(left.dt, right.dt, all=FALSE, sort=sort) | |
## plyr merge/join | |
## --------------- | |
plyr.join <- function(type) result <- plyr::join(left, right, by=c("key", "key2"), type=type, match="first") | |
sort.options <- c(FALSE, TRUE) | |
## ----------- | |
## many-to-one | |
## ----------- | |
results <- matrix(nrow=4, ncol=3) | |
colnames(results) <- c("base::merge", "plyr", "data.table") | |
rownames(results) <- c("inner", "outer", "left", "right") | |
base.functions <- c(inner.join, outer.join, left.join, right.join) | |
plyr.functions <- c(function() plyr.join("inner"), function() plyr.join("full"), | |
function() plyr.join("left"), function() plyr.join("right")) | |
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) | |
for (i in 1:4) { | |
base.func <- base.functions[[i]] | |
plyr.func <- plyr.functions[[i]] | |
dt.func <- dt.functions[[i]] | |
results[i, 1] <- timeit(base.func) | |
results[i, 2] <- timeit(plyr.func) | |
results[i, 3] <- timeit(dt.func) | |
} | |
results | |
# base::merge plyr data.table | |
# inner 0.9587 0.6214 0.0734 | |
# outer 1.4238 0.8191 0.2116 | |
# left 1.1039 0.6241 0.0528 | |
# right 1.1714 0.3678 0.0258 | |
## ------------ | |
## many-to-many | |
## ------------ | |
left.join <- function(sort=FALSE) result <- base::merge(left, right2, all.x=TRUE, sort=sort) | |
right.join <- function(sort=FALSE) result <- base::merge(left, right2, all.y=TRUE, sort=sort) | |
outer.join <- function(sort=FALSE) result <- base::merge(left, right2, all=TRUE, sort=sort) | |
inner.join <- function(sort=FALSE) result <- base::merge(left, right2, all=FALSE, sort=sort) | |
left.join.dt <- function(sort=FALSE) result <- right2.dt[left.dt, allow.cartesian=TRUE] | |
right.join.dt <- function(sort=FALSE) result <- left.dt[right2.dt, allow.cartesian=TRUE] | |
outer.join.dt <- function(sort=FALSE) result <- merge(left.dt, right2.dt, all=TRUE, sort=sort, allow.cartesian=TRUE) | |
inner.join.dt <- function(sort=FALSE) result <- merge(left.dt, right2.dt, all=FALSE, sort=sort, allow.cartesian=TRUE) | |
sort.options <- c(FALSE, TRUE) | |
results <- matrix(nrow=4, ncol=3) | |
colnames(results) <- c("base::merge", "plyr", "data.table") | |
rownames(results) <- c("inner", "outer", "left", "right") | |
base.functions <- c(inner.join, outer.join, left.join, right.join) | |
plyr.functions <- c(function() plyr.join("inner"), function() plyr.join("full"), | |
function() plyr.join("left"), function() plyr.join("right")) | |
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) | |
for (i in 1:4) { | |
base.func <- base.functions[[i]] | |
plyr.func <- plyr.functions[[i]] | |
dt.func <- dt.functions[[i]] | |
results[i, 1] <- timeit(base.func) | |
results[i, 2] <- timeit(plyr.func) | |
results[i, 3] <- timeit(dt.func) | |
} | |
results | |
# base::merge plyr data.table | |
# inner 1.9289 0.6080 0.0917 | |
# outer 2.8860 0.8485 0.3490 | |
# left 2.1343 0.6279 0.0758 | |
# right 2.3700 0.3745 0.0429 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment