Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active December 30, 2015 13:38
Show Gist options
  • Save arunsrinivasan/7836512 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7836512 to your computer and use it in GitHub Desktop.
1.8.11 (commit 1048) : Benchmark: comparison between data.table 1.8.10 and 1.8.11 commit 1048
# version 1.8.11 (commit 1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")
## Build a huge data.table
## -----------------------
set.seed(1)  # fixed seed so the generated data is reproducible
N <- 2e7     # number of rows in DT

# Helper for the character vector of about 1e5 strings generated below:
# returns one random lowercase string of 5 to 9 letters. The length is drawn
# first and the letters second, both from the global RNG stream.
foo <- function() {
  n_chars <- sample(5:9, 1)
  picked <- sample(letters, n_chars, TRUE)
  paste(picked, collapse = "")
}
# Draw 1e5 random strings with foo() and keep only the distinct ones.
ch <- unique(vapply(seq_len(1e5), function(i) foo(), character(1)))
# > length(ch)
# [1] 99982
# Assemble DT: N rows, each column sampled with replacement from its own pool.
#   a: double    - NA, Inf, -Inf, NaN plus 1e6 scaled normal draws
#   b: double    - 1e6 normal draws
#   c: integer   - NA plus the integers 1e5..1e6
#   d: character - the distinct strings in `ch`
DT <- data.table(
  a = as.numeric(
    sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), N, replace = TRUE)
  ),
  b = as.numeric(sample(rnorm(1e6), N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), N, replace = TRUE),
  d = sample(ch, N, replace = TRUE)
)
# > tables()
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d
# Total: 539MB
## Timing 'setkey' on a single column
## ----------------------------------
# Every run keys a fresh copy, so DT itself stays unkeyed in these tests.

# numeric column: a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a))
# user system elapsed
# 8.097 0.414 8.599

# waiting for my laptop to cool down... and running again
# numeric column: b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b))
# user system elapsed
# 6.778 0.359 7.204

# integer column: c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
# user system elapsed
# 6.756 0.175 6.985

# character column: d
# should be *very* quick
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d))
# user system elapsed
# 10.152 0.225 11.438
## Timing 'setkey' on two columns
## ------------------------------
# As above, every run keys a fresh copy of DT.

# numeric + numeric: a, b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b))
# user system elapsed
# 15.289 1.131 16.934

# integer + numeric: c, a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, a))
# user system elapsed
# 13.037 0.891 15.278

# character + numeric: d, a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, a))
# user system elapsed
# 16.948 0.743 18.824

# numeric + integer: a, c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, c))
# user system elapsed
# 12.135 0.805 13.848

# numeric + character: a, d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, d))
# user system elapsed
# 15.078 0.745 16.425

# integer + character: c, d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, d))
# user system elapsed
# 14.840 0.625 17.685

# character + integer: d, c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, c))
# user system elapsed
# 16.275 0.505 17.397
## Other tests which are not by reference so that we can run them more than once...
## --------------------------------------------------------------------------------
## Timing helper, borrowed from Hadley and generalised to a configurable
## number of repetitions.
##
## code  : an expression; it is captured unevaluated and then evaluated in
##         the caller's frame once per repetition.
## times : number of repetitions (default 3, the original behaviour); must be
##         a single whole number >= 1.
##
## Returns a matrix with one row per repetition and one column per
## system.time() field (user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  stopifnot(
    is.numeric(times), length(times) == 1L, is.finite(times),
    times >= 1, times == trunc(times)
  )
  expr <- substitute(code)
  # Resolve the caller's frame once, here, rather than inside the helper
  # closure below, where parent.frame() would refer to a different frame.
  caller <- parent.frame()
  runs <- lapply(
    seq_len(times),
    function(i) system.time(eval(expr, caller))
  )
  do.call(rbind, runs)
}
# Filtering/subsetting, vector-scan approach, without a key:
# the logical condition d == "pvuyrlxw" is applied to the unkeyed DT.
benchmark(DT[d == "pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 2.295 0.077 2.437 0 0
# [2,] 2.276 0.079 2.402 0 0
# [3,] 2.324 0.014 2.552 0 0
# Filtering/subsetting, vector-scan approach, with a key:
# same condition, but on a copy keyed by d. The copy() and setkey() calls
# run outside benchmark(), so their cost is not part of the timings.
DT.cp <- copy(DT)
setkey(DT.cp, d)
benchmark(DT.cp[d == "pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.734 0.004 0.817 0 0
# [2,] 0.742 0.002 0.780 0 0
# [3,] 0.737 0.003 0.773 0 0
# 1.8.11 is about 3 times faster in the vector-scan approach than 1.8.10
# (see https://gist.github.com/arunsrinivasan/7832436).
# Interesting: setting the key helps vector scanning too... >2x speed-up.
# Binary search approach: the value is passed directly as i to the table
# keyed on d, instead of going through a logical condition.
benchmark(DT.cp["pvuyrlxw"])
# user.self sys.self elapsed user.child sys.child
# [1,] 0.002 0 0.002 0 0
# [2,] 0.003 0 0.002 0 0
# [3,] 0.002 0 0.003 0 0
# Summarising without a key: mean of b grouped by column "c" (one grouping
# column, for simplicity).
benchmark(DT[, mean(b), by=c]) ########################### 900000 groups ######################
# user.self sys.self elapsed user.child sys.child
# [1,] 8.127 0.344 9.224 0 0
# [2,] 8.053 0.457 8.730 0 0
# [3,] 8.047 0.458 8.719 0 0
# Grouping is about 5 times faster in 1.8.11 than in 1.8.10, with no key!
# Summarising with a key: the same aggregation on a copy keyed by c.
# The setkey() call runs outside benchmark(), so its cost (noted on the line
# itself) is not part of the timings below.
DT.cp <- copy(DT)
setkey(DT.cp, c) # <~~~ 7 seconds
benchmark(DT.cp[, mean(b), by=c]) ########################### 900000 groups ######################
# user.self sys.self elapsed user.child sys.child
# [1,] 1.786 0.011 1.987 0 0
# [2,] 1.773 0.009 1.831 0 0
# [3,] 1.776 0.014 2.152 0 0
## Testing aggregation with `dplyr` (other tests will follow later)
## ----------------------------------------------------------------
# NOTE(review): require() returns FALSE with a warning instead of stopping
# when the package is missing; consider library() if this is ever run as a
# non-interactive script.
require(dplyr)
# Convert DT to a plain data.frame and wrap it for dplyr.
DF <- tbl_df(data.frame(DT))
# Grouping is timed on its own here; its cost is paid once and is not part of
# the summarise() timings below.
system.time(DF <- group_by(DF, c))
# user system elapsed
# 21.532 1.550 23.896
benchmark(summarise(DF, mean(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 0.648 0.007 0.675 0 0
# [2,] 0.641 0.002 0.680 0 0
# [3,] 0.638 0.002 0.662 0 0
# Note: here "mean(b)" is run entirely in C (very clever implementation from
# Romain), and it seems to be 3x faster. If we were to replace this function
# with another function that's not implemented in C, but simple enough, then:
# (sum__ is just `sum` under another name - presumably so that dplyr does not
# recognise the call as one it handles in C; confirm against dplyr internals.)
sum__ <- sum
benchmark(summarise(DF, sum__(b)))
# user.self sys.self elapsed user.child sys.child
# [1,] 1.540 0.163 1.813 0 0
# [2,] 1.536 0.153 1.746 0 0
# [3,] 1.530 0.159 1.728 0 0
# Doing the same on data.table to compare - sum is not optimised in DT - and
# it's a primitive. DT.cp is the copy keyed by c from the previous section.
benchmark(DT.cp[, sum(b), by=c])
# user.self sys.self elapsed user.child sys.child
# [1,] 1.446 0.016 1.561 0 0
# [2,] 1.431 0.007 1.505 0 0
# [3,] 1.436 0.007 1.483 0 0
# Seems like `data.table` marginally edges out the (current) version of `dplyr`.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment