Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active December 30, 2015 13:59
Show Gist options
  • Save arunsrinivasan/7839002 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7839002 to your computer and use it in GitHub Desktop.
Benchmarking dplyr and data.table 1.8.11 commit 1048
# version 1.8.11 (commit 1048)
# library() stops with an error when data.table is missing, whereas require()
# only warns and returns FALSE, which would let the script run on and fail
# later at the first data.table call.
library(data.table)
# data.table 1.8.11 For help type: help("data.table")
## inputs for a huge data.table:
## -----------------------------
set.seed(1)
N <- 2e7 # number of rows DT will get
# Pool of roughly 1e5 random lower-case strings, each 5 to 9 letters long.
# foo() draws the length first and the letters second.
foo <- function() {
  len <- sample(5:9, 1)
  paste(sample(letters, len, TRUE), collapse = "")
}
# Generate the strings and drop the duplicates in one go.
ch <- unique(replicate(1e5, foo()))
# > length(ch)
# [1] 99982
# Assemble DT with N rows: a and b are numeric (a also contains NA, Inf, -Inf
# and NaN), c is integer (including NA) and d is drawn from the string pool ch.
DT <- data.table(
  a = as.numeric(
    sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)
  ),
  b = as.numeric(sample(rnorm(1e6), size = N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
  d = sample(ch, size = N, replace = TRUE)
)
# Key a copy rather than DT itself, so that DT stays available for the
# "unkey'd" comparisons further down.
DT.cp <- copy(DT)
# Time how long keying the copy on the integer column c takes.
system.time(setkey(DT.cp, c))
# user system elapsed
# 6.945 0.196 7.312
tables() # memory footprint
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d c
# [2,] DT.cp 20,000,000 539 a,b,c,d c
# Total: 1,078MB
# NOTE(review): the recorded output lists key "c" for DT as well, although only
# DT.cp is keyed in the code above -- confirm against a fresh run.
# library() aborts immediately if dplyr is not installed; require() would only
# warn and return FALSE, so the failure would surface later at tbl_df().
library(dplyr) # as of 6th December
# creating grouped_df from 'dplyr'
# DF is the plain (ungrouped) tbl built from the same data as DT.
DF <- tbl_df(data.frame(DT))
# DF.cp is DF grouped by the integer column c; the grouping itself is timed.
system.time(DF.cp <- group_by(DF, c))
# user system elapsed
# 21.803 1.780 24.970
# memory footprint
print(object.size(DF), units = "Mb") # 538.9 Mb
print(object.size(DF.cp), units = "Mb") # 545.8 Mb
## Borrowing timing function from Hadley:
## --------------------------------------
# Time an expression repeatedly in the caller's environment.
#
# Arguments:
#   code   an unquoted expression; it is captured with substitute() and
#          evaluated afresh on every repetition.
#   times  number of repetitions, a single whole number >= 1. The default of 3
#          reproduces the original behaviour of three hard-coded runs.
#
# Returns a matrix with one row per repetition and the five system.time()
# columns (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  stopifnot(
    is.numeric(times), length(times) == 1L, !is.na(times),
    times >= 1, times == round(times)
  )
  code <- substitute(code)
  # Look up the caller's frame once, here: inside the anonymous function below
  # parent.frame() would no longer refer to the caller of benchmark().
  env <- parent.frame()
  timings <- lapply(seq_len(times), function(i) system.time(eval(code, env)))
  do.call(rbind, timings)
}
## ----------------------------------------------------------------------------------
## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
## ----------------------------------------------------------------------------------
# Every approach is timed twice: first matching one string in the character
# column d, then matching one value in the integer column c. Each matrix below
# holds the three runs returned by benchmark().
# 1a) DF vector-scan subset
benchmark(DF[DF$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 5.592 0.276 6.015 0 0
# [2,] 5.565 0.024 6.058 0 0
# [3,] 5.591 0.021 6.003 0 0
benchmark(DF[DF$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.413 0.136 4.185 0 0
# [2,] 3.434 0.119 3.642 0 0
# [3,] 3.433 0.121 3.701 0 0
# 1b) ordinary DT vector-scan subset
benchmark(DT[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.292 0.075 2.433 0 0
# [2,] 2.390 0.085 2.727 0 0
# [3,] 2.277 0.008 2.420 0 0
benchmark(DT[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.178 0.113 0.295 0 0
# [2,] 0.178 0.114 0.302 0 0
# [3,] 0.179 0.112 0.299 0 0
# 1c) dplyr's 'filter'
benchmark(filter(DF, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 2.474 0.010 2.612 0 0
# [2,] 2.447 0.006 2.504 0 0
# [3,] 2.443 0.006 2.490 0 0
benchmark(filter(DF, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.316 0.121 0.504 0 0
# [2,] 0.318 0.125 0.477 0 0
# [3,] 0.314 0.120 0.452 0 0
## ------------------------------------------------------------------------------
## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
## ------------------------------------------------------------------------------
# Same queries as in section 1, but run against DF.cp (grouped by c) and
# DT.cp (keyed on c).
# 2a) DF vector-scan subset
benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
# user.self sys.self elapsed user.child sys.child
# [1,] 5.557 0.025 5.914 0 0
# [2,] 5.533 0.019 5.830 0 0
# [3,] 5.540 0.017 5.782 0 0
benchmark(DF.cp[DF.cp$c == 169073, ])
# user.self sys.self elapsed user.child sys.child
# [1,] 3.428 0.127 3.871 0 0
# [2,] 3.435 0.120 3.628 0 0
# [3,] 3.440 0.124 3.658 0 0
# 2b) ordinary DT vector-scan subset
benchmark(DT.cp[d == "ewdjgq"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.312 0.011 2.457 0 0
# [2,] 2.273 0.007 2.324 0 0
# [3,] 2.262 0.008 2.318 0 0
benchmark(DT.cp[c == 169073])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.179 0.120 0.307 0 0
# [2,] 0.178 0.111 0.294 0 0
# [3,] 0.177 0.111 0.292 0 0
# 2c) dplyr's 'filter'
benchmark(filter(DF.cp, d == "ewdjgq"))
# user.self sys.self elapsed user.child sys.child
# [1,] 3.515 0.040 3.752 0 0
# [2,] 3.511 0.020 3.824 0 0
# [3,] 3.436 0.012 3.516 0 0
benchmark(filter(DF.cp, c == 169073))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.525 0.181 1.746 0 0
# [2,] 1.484 0.171 1.701 0 0
# [3,] 1.451 0.171 1.686 0 0
# 2d) data.table's binary search
# J(169073) is looked up through the key column c instead of a column scan.
benchmark(DT.cp[J(169073)])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.001 0 0.002 0 0
# [2,] 0.002 0 0.002 0 0
# [3,] 0.002 0 0.002 0 0
# 2e) dplyr's join approach (doesn't use keys though):
# The one-row data.frame is built inside the timed expression.
benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
# user.self sys.self elapsed user.child sys.child
# [1,] 3.219 0.264 3.571 0 0
# [2,] 3.222 0.256 3.628 0 0
# [3,] 3.225 0.253 3.576 0 0
## -------------------------------------------------------------
## 3) Comparing "arrange" (ordering) from dplyr with data.table:
## -------------------------------------------------------------
# Both calls order by column b first and column c second.
benchmark(arrange(DF.cp, b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 41.046 0.473 43.312 0 0
# [2,] 41.218 0.511 43.989 0 0
# [3,] 39.962 0.440 40.741 0 0
# The copy(DT) sits inside the timed expression, so every run keys a fresh copy
# and the cost of copying is included in the timings below.
benchmark(setkey(copy(DT), b,c))
# user.self sys.self elapsed user.child sys.child
# [1,] 14.741 1.322 16.180 0 0
# [2,] 14.364 1.135 15.631 0 0
# [3,] 12.420 1.028 13.528 0 0
## -------------------------------------------------
## 4) Comparing "mutate" from dplyr with data.table:
## -------------------------------------------------
# The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems to create a NAM(2) object,
# whereas ':=' (or 'set') modifies the same object by reference.
# To make the comparison fair, I'll use 'set' on a copy every time.
benchmark(mutate(DF.cp, e=a+b))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.456 0.294 1.761 0 0
# [2,] 1.424 0.290 1.723 0 0
# [3,] 1.413 0.281 1.696 0 0
# run this 3 times
# (the next two lines are re-run by hand; the three timing rows below are the
# three manual runs). The copy and the setkey happen outside system.time(), so
# only the set() call itself is timed.
setkey(DT.cp <- copy(DT), c)
system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
# user system elapsed
# 0.229 0.201 0.429
# 0.235 0.203 0.446
# 0.237 0.219 0.466
## --------------------------------------------------------------------------
## 5) Comparing "join" from dplyr with data.table: (on character column here)
## --------------------------------------------------------------------------
# Re-create the grouped / keyed objects, this time on the character column d.
DF.cp <- group_by(DF, d)
setkey(DT.cp <- copy(DT), d)
# Lookup table: 1e3 strings drawn from ch without replacement.
set.seed(1)
DF.j <- data.frame(d = sample(ch, 1e3, FALSE), stringsAsFactors=FALSE)
DT.j <- data.table(DF.j) # no key on DT.j
benchmark(left_join(DF.j, DF.cp, by="d"))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.616 0.085 1.703 0 0
# [2,] 1.604 0.077 1.685 0 0
# [3,] 1.608 0.073 1.692 0 0
# data.table join: the rows of DT.j are looked up in DT.cp, which is keyed on d.
benchmark(DT.cp[DT.j])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.017 0 0.018 0 0
# [2,] 0.018 0 0.019 0 0
# [3,] 0.017 0 0.018 0 0
## ----------------------------------------------------------------------------------
## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~1e5 unique groups (length(ch) is 99982)
## ----------------------------------------------------------------------------------
# with the groupings on character col. "d"
# (DF.cp is grouped by d and DT.cp is keyed on d since section 5)
# 6a. with C-run function of dplyr
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.200 0.001 0.200 0 0
# [2,] 0.199 0.002 0.292 0 0
# [3,] 0.200 0.000 0.200 0 0
# 6b. evaluating the function instead
# sum__ is just another name for sum; presumably the alias keeps dplyr from
# swapping in its own C implementation, so the R function is called -- confirm.
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.361 0.100 0.463 0 0
# [2,] 0.352 0.078 0.431 0 0
# [3,] 0.354 0.065 0.421 0 0
# 6c. data.table way
benchmark(DT.cp[, list(m.b=sum(b)), by=d])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.674 0.119 0.807 0 0
# [2,] 0.663 0.116 0.784 0 0
# [3,] 0.667 0.120 0.796 0 0
## ----------------------------------------------------------------------------------
## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~9e5 unique groups
## ----------------------------------------------------------------------------------
# with the groupings on integer col. "c"
# 7a. with C-run function of dplyr
# Regroup DF on c first; the regrouping is not part of the timed expression.
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.205 0.000 0.205 0 0
# [2,] 0.199 0.001 0.199 0 0
# [3,] 0.198 0.000 0.202 0 0
# 7b. evaluating the function instead
# Same alias as in 6b, defined again so this section can be run on its own.
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.601 0.074 1.693 0 0
# [2,] 1.564 0.069 1.660 0 0
# [3,] 3.226 0.078 3.397 0 0
# 7c. data.table way
# Re-key DT.cp from d (section 5) to c; this happens outside the timed call.
setkey(DT.cp, c)
benchmark(DT.cp[, list(m.b=sum(b)), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 1.822 0.006 1.894 0 0
# [2,] 1.817 0.005 1.846 0 0
# [3,] 1.837 0.008 1.916 0 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment