Last active
December 30, 2015 12:59
-
-
Save arunsrinivasan/7832436 to your computer and use it in GitHub Desktop.
1.8.10 : Benchmark: comparison between data.table 1.8.10 and 1.8.11 commit 1048
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# version 1.8.10
# library() stops immediately if data.table is not installed; require() would
# only return FALSE and let the script fail later at the first data.table call.
library(data.table)
# Startup banner recorded when this benchmark was run:
# data.table 1.8.10 For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)  # fixed seed so the generated data is reproducible
N <- 2e7     # size of DT (number of rows)
# generate a character vector of length about 1e5
#
# foo() returns one random lower-case string: it first draws a length from
# 5..9 and then draws that many letters (with replacement) and glues them
# together. It takes no arguments and consumes the global RNG stream.
foo <- function() {
  len <- sample(5:9, 1)
  paste(sample(letters, len, replace = TRUE), collapse = "")
}
# Draw 1e5 random strings, then drop duplicates so `ch` holds distinct values.
ch <- replicate(1e5, foo())
ch <- unique(ch)
# > length(ch)
# [1] 99982

# DT now
# Four columns of N rows each, all sampled with replacement:
#   a - doubles drawn from a pool that also contains NA, Inf, -Inf and NaN
#   b - doubles drawn from a pool of rnorm() values
#   c - integers drawn from NA and 1e5:1e6
#   d - strings drawn from `ch`
# The column expressions are kept in this order so the random stream is
# consumed exactly as before.
DT <- data.table(
  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), N, TRUE)),
  b = as.numeric(sample(rnorm(1e6), N, TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
  d = sample(ch, N, TRUE)
)
# > tables() | |
# NAME NROW MB COLS KEY | |
# [1,] DT 20,000,000 539 a,b,c,d | |
# Total: 539MB | |
## testing 'setkey' on 1 column
## ----------------------------
# Each run works on a fresh copy() of DT, so DT itself stays unkeyed and can be
# reused by the next measurement. The figures below each call are the
# system.time() output recorded with data.table 1.8.10.

# setkey on numeric columns - a,b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a))
#    user  system elapsed
#  51.491   0.426  53.457

# waiting for my laptop to cool down... and running again
DT.cp <- copy(DT)
system.time(setkey(DT.cp, b))
#    user  system elapsed
#  51.030   0.353  54.953

# setkey on integer column - c
# R's base radix sort won't help here.. will resort to regularorder1
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
#    user  system elapsed
#  44.500   0.368  48.012

# setkey on character column - d
# should be *very* quick
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d))
#    user  system elapsed
#   9.451   0.178   9.706
## testing 'setkey' on 2 columns
## -----------------------------
# As in the single-column runs, every measurement starts from a fresh copy()
# of DT. The figures below each call are the system.time() output recorded
# with data.table 1.8.10.

# setkey on numeric columns - a,b
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, b))
#    user  system elapsed
#  98.412   0.736 102.319

# setkey on integer+numeric columns - c,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, a))
#    user  system elapsed
#  90.407   0.507  91.512

# setkey on character+numeric columns - d,a
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, a))
#    user  system elapsed
#  58.047   0.414  59.491

# setkey on numeric+integer columns - a,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, c))
#    user  system elapsed
#  92.641   0.872 100.080

# setkey on numeric+character columns - a,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, a, d))
#    user  system elapsed
#  58.607   0.602  63.171

# setkey on integer+character columns - c,d
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c, d))
#    user  system elapsed
#  52.147   0.483  55.035

# setkey on character+integer columns - d,c
DT.cp <- copy(DT)
system.time(setkey(DT.cp, d, c))
#    user  system elapsed
#  53.119   0.659  57.779
## Other tests which are not by reference so that we can run them more than once... | |
## -------------------------------------------------------------------------------- | |
## Borrowing timing function from Hadley | |
# Time an expression repeatedly and stack the timings into one matrix.
#
# Arguments:
#   code   Expression to time. It is captured unevaluated with substitute()
#          and re-evaluated in the caller's frame on every repetition.
#   times  Number of repetitions; defaults to 3, the count that was
#          previously hard-coded.
#
# Returns a numeric matrix with one row per repetition and the columns
# produced by system.time().
benchmark <- function(code, times = 3L) {
  expr <- substitute(code)
  # Resolve the caller's frame once, here: inside the anonymous function
  # below, parent.frame() would no longer refer to benchmark()'s caller.
  caller <- parent.frame()
  stopifnot(
    is.numeric(times), length(times) == 1L, !is.na(times), times >= 1
  )
  runs <- lapply(
    seq_len(times),
    function(i) system.time(eval(expr, caller))
  )
  do.call(rbind, runs)
}
# filtering/subsetting : vector-scan approach - without key
benchmark(DT[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     6.018    0.329   6.625          0         0
# [2,]     5.939    0.091   6.243          0         0
# [3,]     5.900    0.019   6.169          0         0

# filtering/subsetting : vector-scan approach - with key
# The key is set on a copy so the unkeyed DT stays available for other runs.
DT.cp <- copy(DT)
setkey(DT.cp, d)
benchmark(DT.cp[d == "pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     2.803    0.009   2.965          0         0
# [2,]     2.790    0.010   2.978          0         0
# [3,]     2.794    0.009   2.897          0         0
# Interesting : setting key helps in vector-scanning... >2x speed-up

# binary search approach
# Looks the value up through the key on `d` set above instead of scanning.
benchmark(DT.cp["pvuyrlxw"])
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.002        0   0.002          0         0
# [2,]     0.002        0   0.002          0         0
# [3,]     0.002        0   0.002          0         0
# summarising : without key - column "c" - for simplicity
benchmark(DT[, mean(b), by = c]) ########################### 900000 groups ######################
#      user.self sys.self elapsed user.child sys.child
# [1,]    45.167    0.357  47.287          0         0
# [2,]    45.148    0.444  47.796          0         0
# [3,]    45.971    0.435  48.342          0         0

# summarising : with key - column "c" - for simplicity
# Same aggregation, but on a copy of DT that has first been keyed by `c`.
DT.cp <- copy(DT)
setkey(DT.cp, c)
benchmark(DT.cp[, mean(b), by = c]) ########################### 900000 groups ######################
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.800    0.034   1.887          0         0
# [2,]     2.615    0.035   2.790          0         0
# [3,]     1.804    0.030   1.949          0         0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment