Created
December 7, 2013 17:44
-
-
Save arunsrinivasan/7846014 to your computer and use it in GitHub Desktop.
Benchmarking dplyr and data.table 1.8.11 commit 1048 (with lesser groups)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# data.table version 1.8.11 (commit 1048)
# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script fail further down.
library(data.table)
# data.table 1.8.11 For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # size of DT (number of rows sampled for each column)
# generate a character vector of about 1e3 unique random strings
# foo() returns one string of 5 to 9 lowercase letters, drawn with replacement
foo <- function() paste(sample(letters, sample(5:9, 1), TRUE), collapse = "")
ch <- replicate(1e3, foo())
ch <- unique(ch) # drop accidental duplicates
# > length(ch)
# [1] 1000
# DT now
#   a: doubles drawn from NA, Inf, -Inf, NaN and 1e3 scaled normal deviates
#   b: doubles drawn from 1e4 normal deviates
#   c: integers drawn from -1000:1000 (at most 2001 distinct values)
#   d: strings drawn from `ch`
DT <- data.table(
  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e3) * 1e6), N, TRUE)),
  b = as.numeric(sample(rnorm(1e4), N, TRUE)),
  c = sample(-1000:1000, N, TRUE),
  d = sample(ch, N, TRUE)
)
# keyed copy; DT itself is left unkeyed for the vector-scan benchmarks
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
#    user  system elapsed
#   5.931   0.276   6.340
tables() # memory footprint
#      NAME        NROW  MB COLS    KEY
# [1,] DT    20,000,000 535 a,b,c,d c
# [2,] DT.cp 20,000,000 535 a,b,c,d c
# Total: 1,070MB
# NOTE(review): the recorded output lists key `c` for DT as well, although
# this script only calls setkey() on DT.cp -- confirm against a fresh run.
# library() fails fast if dplyr is not installed (require() would not)
library(dplyr) # as of 6th December
# creating grouped_df from 'dplyr'
DF <- tbl_df(data.frame(DT))
system.time(DF.cp <- group_by(DF, c))
#    user  system elapsed
#   5.166   1.171   6.394
# memory footprint
print(object.size(DF), units = "Mb") # 534.1 Mb
print(object.size(DF.cp), units = "Mb") # 534.1 Mb
## Borrowing timing function from Hadley:
## --------------------------------------
#' Time an expression repeatedly in the caller's environment.
#'
#' @param code Unquoted expression to time. It is captured with substitute()
#'   and evaluated in the frame that called benchmark(), so assignments made
#'   by the expression land in that frame.
#' @param times Number of timed runs; defaults to 3.
#' @return A matrix with one row per run and the system.time() columns
#'   (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)
  expr <- substitute(code)
  # Resolve the caller's frame once, here: inside the anonymous function
  # below, parent.frame() would no longer refer to benchmark()'s caller.
  caller <- parent.frame()
  runs <- lapply(seq_len(times), function(i) system.time(eval(expr, caller)))
  do.call(rbind, runs)
}
## ----------------------------------------------------------------------------------
## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
## ----------------------------------------------------------------------------------
# NOTE(review): column `c` is sampled from -1000:1000 when DT is built, so
# `c == 169073` cannot match any row; those runs time a scan that returns
# zero rows.

# 1a) DF vector-scan subset
benchmark(DF[DF$d == "ewdjgq", ])
#      user.self sys.self elapsed user.child sys.child
# [1,]     4.097    0.358   4.531          0         0
# [2,]     3.970    0.008   4.053          0         0
# [3,]     3.959    0.004   3.980          0         0
benchmark(DF[DF$c == 169073, ])
#      user.self sys.self elapsed user.child sys.child
# [1,]     3.398    0.110   3.546          0         0
# [2,]     3.369    0.106   3.489          0         0
# [3,]     3.377    0.110   3.557          0         0

# 1b) ordinary DT vector-scan subset
benchmark(DT[d == "ewdjgq"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.749    0.003   0.775          0         0
# [2,]     0.746    0.002   0.756          0         0
# [3,]     0.748    0.002   0.763          0         0
benchmark(DT[c == 169073])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.169    0.104   0.274          0         0
# [2,]     0.171    0.105   0.277          0         0
# [3,]     0.172    0.108   0.282          0         0

# 1c) dplyr's 'filter'
benchmark(filter(DF, d == "ewdjgq"))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.845    0.002   0.913          0         0
# [2,]     0.840    0.001   0.847          0         0
# [3,]     0.843    0.001   0.847          0         0
benchmark(filter(DF, c == 169073))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.268    0.111   0.379          0         0
# [2,]     0.266    0.111   0.377          0         0
# [3,]     0.268    0.106   0.374          0         0
## ------------------------------------------------------------------------------
## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
## ------------------------------------------------------------------------------
# NOTE(review): column `c` is sampled from -1000:1000 when DT is built, so
# the value 169073 used below cannot match any row.

# 2a) DF vector-scan subset
benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
#      user.self sys.self elapsed user.child sys.child
# [1,]     4.002    0.015   4.481          0         0
# [2,]     3.941    0.005   3.974          0         0
# [3,]     3.959    0.004   3.993          0         0
benchmark(DF.cp[DF.cp$c == 169073, ])
#      user.self sys.self elapsed user.child sys.child
# [1,]     3.420    0.120   3.690          0         0
# [2,]     3.415    0.121   3.696          0         0
# [3,]     3.429    0.119   3.723          0         0

# 2b) ordinary DT vector-scan subset
benchmark(DT.cp[d == "ewdjgq"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.746    0.003   0.779          0         0
# [2,]     0.744    0.003   0.825          0         0
# [3,]     0.744    0.004   1.011          0         0
benchmark(DT.cp[c == 169073])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.171    0.106   0.279          0         0
# [2,]     0.173    0.107   0.280          0         0
# [3,]     0.169    0.104   0.274          0         0

# 2c) dplyr's 'filter'
benchmark(filter(DF.cp, d == "ewdjgq"))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.943    0.050   1.017          0         0
# [2,]     0.943    0.049   1.002          0         0
# [3,]     0.942    0.046   1.054          0         0
benchmark(filter(DF.cp, c == 169073))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.351    0.157   0.510          0         0
# [2,]     0.347    0.150   0.497          0         0
# [3,]     0.350    0.144   0.504          0         0

# 2d) data.table's binary search (on the key column `c` of DT.cp)
benchmark(DT.cp[J(169073)])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.003        0   0.003          0         0
# [2,]     0.002        0   0.002          0         0
# [3,]     0.002        0   0.002          0         0

# 2e) dplyr's join approach (doesn't use keys though):
benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = "c"))
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.054    0.095   1.200          0         0
# [2,]     1.038    0.091   1.206          0         0
# [3,]     1.032    0.087   1.129          0         0
## -------------------------------------------------------------
## 3) Comparing "arrange" (ordering) from dplyr with data.table:
## -------------------------------------------------------------
benchmark(arrange(DF.cp, b, c))
#      user.self sys.self elapsed user.child sys.child
# [1,]    41.444    0.503  43.481          0         0
# [2,]    40.557    0.458  41.569          0         0
# [3,]    40.066    0.433  40.995          0         0

# copy(DT) is part of the timed expression, so each run keys a fresh copy
benchmark(setkey(copy(DT), b, c))
#      user.self sys.self elapsed user.child sys.child
# [1,]    11.047    1.452  12.616          0         0
# [2,]    11.032    0.886  12.154          0         0
# [3,]    11.490    0.953  14.389          0         0
## -------------------------------------------------
## 4) Comparing "mutate" from dplyr with data.table:
## -------------------------------------------------
# The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems
# to create a NAM(2) object, whereas ':=' (or 'set') modifies the same object
# by reference.
# To make the comparison fair, I'll use 'set' on a copy every time.
benchmark(mutate(DF.cp, e = a + b))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.218    0.195   0.416          0         0
# [2,]     0.216    0.193   0.411          0         0
# [3,]     0.222    0.189   0.419          0         0

# run this 3 times (the copy and re-key are outside the timed call)
DT.cp <- copy(DT)
setkey(DT.cp, c)
system.time(set(DT.cp, i = NULL, j = "e", value = DT.cp$a + DT.cp$b))
#    user  system elapsed
#   0.234   0.212   0.448
#   0.235   0.205   0.450
#   0.239   0.212   0.461
## --------------------------------------------------------------------------
## 5) Comparing "join" from dplyr with data.table: (on character column here)
## --------------------------------------------------------------------------
# regroup / rekey both tables on the character column `d`
DF.cp <- group_by(DF, d)
DT.cp <- copy(DT)
setkey(DT.cp, d)

# lookup table: 100 distinct strings drawn from `ch` without replacement
set.seed(1)
DF.j <- data.frame(d = sample(ch, 1e2, FALSE), stringsAsFactors = FALSE)
DT.j <- data.table(DF.j) # no key on DT.j

benchmark(left_join(DF.j, DF.cp, by = "d"))
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.359    0.197   1.571          0         0
# [2,]     1.371    0.164   1.613          0         0
# [3,]     1.389    0.174   1.720          0         0
benchmark(DT.cp[DT.j])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.139    0.013   0.153          0         0
# [2,]     0.139    0.004   0.151          0         0
# [3,]     0.140    0.004   0.147          0         0
## ----------------------------------------------------------------------------------
## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~1e3 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on character col. "d"
# (`d` is drawn from `ch`, which holds 1000 unique strings)

# 6a. with C-run function of dplyr
benchmark(summarise(DF.cp, m.b = sum(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.202    0.000   0.203          0         0
# [2,]     0.197    0.000   0.199          0         0
# [3,]     0.200    0.001   0.210          0         0

# 6b. evaluating the function instead
# (sum__ is plain `sum` under another name)
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.202    0.122   0.324          0         0
# [2,]     0.207    0.129   0.353          0         0
# [3,]     0.207    0.127   0.340          0         0

# 6c. data.table way
benchmark(DT.cp[, list(m.b = sum(b)), by = d])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.460    0.112   0.575          0         0
# [2,]     0.454    0.117   0.572          0         0
# [3,]     0.451    0.117   0.568          0         0
## ----------------------------------------------------------------------------------
## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - at most 2001 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on integer col. "c"
# (`c` is drawn from -1000:1000)

# 7a. with C-run function of dplyr
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.199    0.000   0.204          0         0
# [2,]     0.197    0.000   0.198          0         0
# [3,]     0.200    0.001   0.200          0         0

# 7b. evaluating the function instead
# (sum__ is plain `sum` under another name)
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.203    0.095   0.298          0         0
# [2,]     0.205    0.097   0.303          0         0
# [3,]     0.206    0.089   0.297          0         0

# 7c. data.table way
setkey(DT.cp, c)
benchmark(DT.cp[, list(m.b = sum(b)), by = c])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.424    0.002   0.443          0         0
# [2,]     0.415    0.000   0.427          0         0
# [3,]     0.413    0.000   0.415          0         0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment