Last active
December 30, 2015 13:38
-
-
Save arunsrinivasan/7836512 to your computer and use it in GitHub Desktop.
1.8.11 (commit 1048) : Benchmark: comparison between data.table 1.8.10 and 1.8.11 commit 1048
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# version 1.8.11 (commit 1048)
# library() rather than require(): a missing package now stops the script
# immediately instead of returning FALSE and failing later on.
library(data.table)
# Startup messages recorded in the original session (which used require()):
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")

## create a huge data.table:
## -------------------------
# NOTE(review): the sizes recorded below come from the original session.
# R >= 3.6.0 changed the default sample() algorithm, so the exact values drawn
# after set.seed(1) may differ on a current R -- confirm before comparing.
set.seed(1)
N <- 2e7 # size of DT

# generate a character vector of length about 1e5
# foo() returns one random lower-case string of 5 to 9 letters.
foo <- function() {
  paste(
    sample(letters, size = sample(5:9, size = 1), replace = TRUE),
    collapse = ""
  )
}
ch <- replicate(1e5, foo())
ch <- unique(ch)
# > length(ch)
# [1] 99982

# DT now
# a: doubles including NA, Inf, -Inf and NaN; b: doubles; c: integers
# including NA; d: strings drawn from ch.
DT <- data.table(
  a = as.numeric(
    sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), size = N, replace = TRUE)
  ),
  b = as.numeric(sample(rnorm(1e6), size = N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), size = N, replace = TRUE),
  d = sample(ch, size = N, replace = TRUE)
)
# > tables()
#      NAME       NROW  MB COLS    KEY
# [1,] DT   20,000,000 539 a,b,c,d
# Total: 539MB
## testing 'setkey' on 1 column
## ----------------------------
# Each timing runs on a fresh copy() of DT, made outside system.time(), so
# that every setkey() call starts from the same unkeyed table.

# setkey on numeric columns - a,b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a))
#   user  system elapsed
#  8.097   0.414   8.599

# waiting for my laptop to cool down... and running again
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b))
#   user  system elapsed
#  6.778   0.359   7.204

# setkey on integer column - c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
#   user  system elapsed
#  6.756   0.175   6.985

# setkey on character column - d
# should be *very* quick
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d))
#   user  system elapsed
# 10.152   0.225  11.438
## testing 'setkey' on 2 columns
## -----------------------------
# As above, each timing runs on a fresh copy() of DT made outside
# system.time().

# setkey on numeric columns - a,b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b))
#   user  system elapsed
# 15.289   1.131  16.934

# setkey on integer+numeric columns - c,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, a))
#   user  system elapsed
# 13.037   0.891  15.278

# setkey on character+numeric columns - d,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, a))
#   user  system elapsed
# 16.948   0.743  18.824

# setkey on numeric+integer columns - a,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, c))
#   user  system elapsed
# 12.135   0.805  13.848

# setkey on numeric+character columns - a,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, d))
#   user  system elapsed
# 15.078   0.745  16.425

# setkey on integer+character columns - c,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, d))
#   user  system elapsed
# 14.840   0.625  17.685

# setkey on character+integer columns - d,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, c))
#   user  system elapsed
# 16.275   0.505  17.397
## Other tests which are not by reference so that we can run them more than once...
## --------------------------------------------------------------------------------
## Borrowing timing function from Hadley
# Time an expression several times and return one row of timings per run.
#
# Args:
#   code:  an unevaluated expression; it is captured with substitute() and
#          re-evaluated in the caller's frame on every run.
#   times: number of repetitions (default 3, matching the original
#          hard-coded behaviour). Must be a single number >= 1.
#
# Returns:
#   A numeric matrix with `times` rows, one system.time() result per run
#   (columns user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  stopifnot(
    is.numeric(times), length(times) == 1L, !is.na(times), times >= 1
  )
  expr <- substitute(code)
  # Capture the caller's frame here: inside the lapply() closure below,
  # parent.frame() would no longer refer to benchmark()'s caller.
  caller <- parent.frame()
  runs <- lapply(
    seq_len(times),
    function(i) system.time(eval(expr, caller))
  )
  do.call(rbind, runs)
}
# filtering/subsetting : vector-scan approach - without key
benchmark(DT[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     2.295    0.077   2.437          0         0
# [2,]     2.276    0.079   2.402          0         0
# [3,]     2.324    0.014   2.552          0         0

# filtering/subsetting : vector-scan approach - with key
DT.cp <- copy(DT)
setkey(DT.cp, d)
benchmark(DT.cp[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.734    0.004   0.817          0         0
# [2,]     0.742    0.002   0.780          0         0
# [3,]     0.737    0.003   0.773          0         0
# 1.8.11 is about 3 times faster in vector-scan approach than 1.8.10 (see https://gist.github.com/arunsrinivasan/7832436)
# Interesting : setting key helps in vector-scanning... >2x speed-up

# binary search approach
# (uses the key on d set above: the string is looked up in the key column)
benchmark(DT.cp["pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.002        0   0.002          0         0
# [2,]     0.003        0   0.002          0         0
# [3,]     0.002        0   0.003          0         0

# summarising : without key - column "c" - for simplicity
benchmark(DT[, mean(b), by = c]) # 900000 groups
#      user.self sys.self elapsed user.child sys.child
# [1,]     8.127    0.344   9.224          0         0
# [2,]     8.053    0.457   8.730          0         0
# [3,]     8.047    0.458   8.719          0         0
# grouping is about 5 times faster in 1.8.11 than in 1.8.10! with no key

# summarising : with key - column "c" - for simplicity
DT.cp <- copy(DT)
setkey(DT.cp, c) # <~~~ 7 seconds
benchmark(DT.cp[, mean(b), by = c]) # 900000 groups
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.786    0.011   1.987          0         0
# [2,]     1.773    0.009   1.831          0         0
# [3,]     1.776    0.014   2.152          0         0
## testing aggregation with `dplyr` (other tests will follow later)
## ----------------------------------------------------------------
# library() rather than require(): a missing package stops the script here
# instead of returning FALSE and failing on the first dplyr call.
library(dplyr)
# NOTE(review): tbl_df() is deprecated in current dplyr releases in favour of
# tibble::as_tibble(); it is kept here so the call matches the recorded
# timings -- confirm before running against a modern dplyr.
DF <- tbl_df(data.frame(DT))
# The grouping step is timed on its own; the summarise() calls below run on
# the already-grouped DF.
system.time(DF <- group_by(DF, c))
#   user  system elapsed
# 21.532   1.550  23.896
benchmark(summarise(DF, mean(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.648    0.007   0.675          0         0
# [2,]     0.641    0.002   0.680          0         0
# [3,]     0.638    0.002   0.662          0         0
# Note: here "mean(b)" is run entirely in C (Very clever implementation from Romain).
# And it seems to be 3x faster. If we were to replace this function with another
# function that's not implemented in C, but simple enough, then:
sum__ <- sum
benchmark(summarise(DF, sum__(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.540    0.163   1.813          0         0
# [2,]     1.536    0.153   1.746          0         0
# [3,]     1.530    0.159   1.728          0         0

# doing the same on data.table to compare - sum is not optimised in DT - and it's a primitive.
# DT.cp here is the copy keyed on column c by the preceding summarising test.
benchmark(DT.cp[, sum(b), by = c])
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.446    0.016   1.561          0         0
# [2,]     1.431    0.007   1.505          0         0
# [3,]     1.436    0.007   1.483          0         0
# Seems like `data.table` marginally edges over the (current) version of `dplyr`.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment