Last active
December 30, 2015 13:59
-
-
Save arunsrinivasan/7839002 to your computer and use it in GitHub Desktop.
Benchmarking dplyr and data.table 1.8.11 commit 1048
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# version 1.8.11 (commit 1048)
# NOTE: require() is kept (rather than library()) so that the recorded
# "Loading required package" message below still matches what the call prints.
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")
## create a huge data.table:
## -------------------------
# Fixed seed: the benchmarks further down look up literal values
# (e.g. "ewdjgq", 169073) in the data generated from this seed.
set.seed(1)
N <- 2e7 # size of DT
# generate a character vector of length about 1e5
# Build one random lowercase string of 5 to 9 letters.
#
# Takes no arguments and returns a length-one character vector.
# The length is drawn first and the letters second, which is the same order in
# which the one-line form `sample(letters, sample(5:9, 1), TRUE)` consumes the
# random-number stream, so results under a fixed seed are unchanged.
foo <- function() {
  n_chars <- sample(5:9, 1)
  paste(sample(letters, n_chars, replace = TRUE), collapse = "")
}
# Draw 1e5 random strings with foo() and drop duplicates.
ch <- replicate(1e5, foo())
ch <- unique(ch)
# > length(ch)
# [1] 99982
# DT now
# Columns: a = doubles mixed with NA/Inf/-Inf/NaN, b = doubles,
# c = integers 1e5..1e6 plus NA, d = strings sampled from ch.
# The columns are generated in this order on purpose: reordering them would
# change the random draws and therefore the data the benchmarks look up.
DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
                 b = as.numeric(sample(rnorm(1e6), N, TRUE)),
                 c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
                 d = sample(ch, N, TRUE))
# Keyed copy: DT itself stays unkeyed for the "unkey'd" benchmarks.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
# user system elapsed
# 6.945 0.196 7.312
tables() # memory footprint
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d c
# [2,] DT.cp 20,000,000 539 a,b,c,d c
# Total: 1,078MB
require(dplyr) # as of 6th December
# creating grouped_df from 'dplyr'
DF <- tbl_df(data.frame(DT))
system.time(DF.cp <- group_by(DF, c))
# user system elapsed
# 21.803 1.780 24.970
# memory footprint
print(object.size(DF), units='Mb') # 538.9 Mb
print(object.size(DF.cp), units='Mb') # 545.8 Mb
## Borrowing timing function from Hadley: | |
## -------------------------------------- | |
# Time an expression several times and return the timings as a matrix.
#
# Args:
#   code:  an expression; it is captured unevaluated with substitute() and
#          re-evaluated in the caller's frame on every run, so assignments
#          inside it land in the caller's environment.
#   times: number of timed runs. Defaults to 3, which reproduces the
#          three-row output this script records.
#
# Returns:
#   A matrix with one row per run and the columns produced by system.time()
#   (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  expr <- substitute(code)
  # Capture the caller's frame once, here: inside the helper below
  # parent.frame() would point at a different frame.
  caller <- parent.frame()
  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)
  time_once <- function(i) system.time(eval(expr, caller))
  do.call(rbind, lapply(seq_len(times), time_once))
}
## ----------------------------------------------------------------------------------
## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
## ----------------------------------------------------------------------------------
# Each query is run once on the character column d and once on the integer
# column c, against the ungrouped DF and the unkeyed DT.
# 1a) DF vector-scan subset
benchmark(DF[DF$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 5.592 0.276 6.015 0 0
# [2,] 5.565 0.024 6.058 0 0
# [3,] 5.591 0.021 6.003 0 0
benchmark(DF[DF$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.413 0.136 4.185 0 0
# [2,] 3.434 0.119 3.642 0 0
# [3,] 3.433 0.121 3.701 0 0
# 1b) ordinary DT vector-scan subset
benchmark(DT[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.292 0.075 2.433 0 0
# [2,] 2.390 0.085 2.727 0 0
# [3,] 2.277 0.008 2.420 0 0
benchmark(DT[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.178 0.113 0.295 0 0
# [2,] 0.178 0.114 0.302 0 0
# [3,] 0.179 0.112 0.299 0 0
# 1c) dplyr's 'filter'
benchmark(filter(DF, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 2.474 0.010 2.612 0 0
# [2,] 2.447 0.006 2.504 0 0
# [3,] 2.443 0.006 2.490 0 0
benchmark(filter(DF, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.316 0.121 0.504 0 0
# [2,] 0.318 0.125 0.477 0 0
# [3,] 0.314 0.120 0.452 0 0
## ------------------------------------------------------------------------------
## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
## ------------------------------------------------------------------------------
# Same queries as section 1, now against DF.cp (grouped by c) and
# DT.cp (keyed on c).
# 2a) DF.cp vector-scan subset
benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 5.557 0.025 5.914 0 0
# [2,] 5.533 0.019 5.830 0 0
# [3,] 5.540 0.017 5.782 0 0
benchmark(DF.cp[DF.cp$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.428 0.127 3.871 0 0
# [2,] 3.435 0.120 3.628 0 0
# [3,] 3.440 0.124 3.658 0 0
# 2b) ordinary DT.cp vector-scan subset
benchmark(DT.cp[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.312 0.011 2.457 0 0
# [2,] 2.273 0.007 2.324 0 0
# [3,] 2.262 0.008 2.318 0 0
benchmark(DT.cp[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.179 0.120 0.307 0 0
# [2,] 0.178 0.111 0.294 0 0
# [3,] 0.177 0.111 0.292 0 0
# 2c) dplyr's 'filter'
benchmark(filter(DF.cp, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 3.515 0.040 3.752 0 0
# [2,] 3.511 0.020 3.824 0 0
# [3,] 3.436 0.012 3.516 0 0
benchmark(filter(DF.cp, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.525 0.181 1.746 0 0
# [2,] 1.484 0.171 1.701 0 0
# [3,] 1.451 0.171 1.686 0 0
# 2d) data.table's binary search
benchmark(DT.cp[J(169073)])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.001 0 0.002 0 0
# [2,] 0.002 0 0.002 0 0
# [3,] 0.002 0 0.002 0 0
# 2e) dplyr's join approach (doesn't use keys though):
benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
# user.self sys.self elapsed user.child sys.child
# [1,] 3.219 0.264 3.571 0 0
# [2,] 3.222 0.256 3.628 0 0
# [3,] 3.225 0.253 3.576 0 0
## -------------------------------------------------------------
## 3) Comparing "arrange" (ordering) from dplyr with data.table:
## -------------------------------------------------------------
benchmark(arrange(DF.cp, b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 41.046 0.473 43.312 0 0
# [2,] 41.218 0.511 43.989 0 0
# [3,] 39.962 0.440 40.741 0 0
# setkey() is applied to a fresh copy(DT) on every run, so the time of
# the copy is included in each measurement.
benchmark(setkey(copy(DT), b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 14.741 1.322 16.180 0 0
# [2,] 14.364 1.135 15.631 0 0
# [3,] 12.420 1.028 13.528 0 0
## -------------------------------------------------
## 4) Comparing "mutate" from dplyr with data.table:
## -------------------------------------------------
# The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems to create a NAM(2) object,
# whereas ':=' (or 'set') modifies the same object by reference.
# To make the comparison fair, I'll use 'set' on a copy every time.
benchmark(mutate(DF.cp, e=a+b))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.456 0.294 1.761 0 0
# [2,] 1.424 0.290 1.723 0 0
# [3,] 1.413 0.281 1.696 0 0
# Run the next two lines 3 times by hand: the first line rebuilds DT.cp from
# DT, so column 'e' is newly added on every timed run; only the set() call is
# timed. The three output rows below are those three manual runs.
setkey(DT.cp <- copy(DT), c)
system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
# user system elapsed
# 0.229 0.201 0.429
# 0.235 0.203 0.446
# 0.237 0.219 0.466
## --------------------------------------------------------------------------
## 5) Comparing "join" from dplyr with data.table: (on character column here)
## --------------------------------------------------------------------------
# Re-group / re-key both tables on the character column d for this section.
DF.cp <- group_by(DF, d)
setkey(DT.cp <- copy(DT), d)
# Lookup table: 1e3 distinct strings drawn from ch without replacement.
set.seed(1)
DF.j <- data.frame(d = sample(ch, 1e3, FALSE), stringsAsFactors=FALSE)
DT.j <- data.table(DF.j) # no key on DT.j
benchmark(left_join(DF.j, DF.cp, by="d"))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.616 0.085 1.703 0 0
# [2,] 1.604 0.077 1.685 0 0
# [3,] 1.608 0.073 1.692 0 0
benchmark(DT.cp[DT.j])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.017 0 0.018 0 0
# [2,] 0.018 0 0.019 0 0
# [3,] 0.017 0 0.018 0 0
## ----------------------------------------------------------------------------------
## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~1e5 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on character col. "d"
# (d is sampled from ch, whose recorded length is 99982)
# 6a. with C-run function of dplyr
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.200 0.001 0.200 0 0
# [2,] 0.199 0.002 0.292 0 0
# [3,] 0.200 0.000 0.200 0 0
# 6b. evaluating the function instead
# sum__ is just sum under another name, used to contrast with 6a.
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.361 0.100 0.463 0 0
# [2,] 0.352 0.078 0.431 0 0
# [3,] 0.354 0.065 0.421 0 0
# 6c. data.table way
benchmark(DT.cp[, list(m.b=sum(b)), by=d])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.674 0.119 0.807 0 0
# [2,] 0.663 0.116 0.784 0 0
# [3,] 0.667 0.120 0.796 0 0
## ----------------------------------------------------------------------------------
## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~9e5 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on integer col. "c"
# 7a. with C-run function of dplyr
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.205 0.000 0.205 0 0
# [2,] 0.199 0.001 0.199 0 0
# [3,] 0.198 0.000 0.202 0 0
# 7b. evaluating the function instead
# sum__ is just sum under another name, used to contrast with 7a.
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.601 0.074 1.693 0 0
# [2,] 1.564 0.069 1.660 0 0
# [3,] 3.226 0.078 3.397 0 0
# 7c. data.table way
# Re-key DT.cp on c before grouping by c.
setkey(DT.cp, c)
benchmark(DT.cp[, list(m.b=sum(b)), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 1.822 0.006 1.894 0 0
# [2,] 1.817 0.005 1.846 0 0
# [3,] 1.837 0.008 1.916 0 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment