Last active
October 25, 2018 18:24
-
-
Save Laurae2/0e9b6a3cdf0b5480e12c8856e7fb10ca to your computer and use it in GitHub Desktop.
data.table benchmark: grouped aggregation on 2 billion rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# library(data.table) | |
# N=2e9; K=100 | |
# set.seed(1) | |
# DT <- data.table( | |
# id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
# id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
# id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) | |
# id4 = sample(K, N, TRUE), # large groups (int) | |
# id5 = sample(K, N, TRUE), # large groups (int) | |
# id6 = sample(N/K, N, TRUE), # small groups (int) | |
# v1 = sample(5, N, TRUE), # int in range [1,5] | |
# v2 = sample(5, N, TRUE), # int in range [1,5] | |
# v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749 | |
# ) | |
# cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") | |
# system.time( DT[, sum(v1), keyby=id1] ) | |
# system.time( DT[, sum(v1), keyby=id1] ) | |
# system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
# system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
# system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
# system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
# system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
# system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
# system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
# system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
R version 3.5.0 (2018-04-23) -- "Joy in Playing" | |
Copyright (C) 2018 The R Foundation for Statistical Computing | |
Platform: x86_64-pc-linux-gnu (64-bit) | |
R is free software and comes with ABSOLUTELY NO WARRANTY. | |
You are welcome to redistribute it under certain conditions. | |
Type 'license()' or 'licence()' for distribution details. | |
Natural language support but running in an English locale | |
R is a collaborative project with many contributors. | |
Type 'contributors()' for more information and | |
'citation()' on how to cite R or R packages in publications. | |
Type 'demo()' for some demos, 'help()' for on-line help, or | |
'help.start()' for an HTML browser interface to help. | |
Type 'q()' to quit R. | |
# Dual Xeon Gold 6130 (2 x 16 cores), 32 cores / 64 threads total @ 3.7 GHz max turbo / 2.8 GHz all-core turbo | |
> library(data.table) | |
data.table 1.11.8 Latest news: r-datatable.com | |
> N=2e9; K=100 | |
> set.seed(1) | |
> DT <- data.table( | |
+ id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
+ id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
+ id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) | |
+ id4 = sample(K, N, TRUE), # large groups (int) | |
+ id5 = sample(K, N, TRUE), # large groups (int) | |
+ id6 = sample(N/K, N, TRUE), # small groups (int) | |
+ v1 = sample(5, N, TRUE), # int in range [1,5] | |
+ v2 = sample(5, N, TRUE), # int in range [1,5] | |
+ v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749 | |
+ ) | |
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") | |
GB = 98.359 | |
> system.time( DT[, sum(v1), keyby=id1] ) | |
user system elapsed | |
76.547 18.953 95.518 | |
> system.time( DT[, sum(v1), keyby=id1] ) | |
user system elapsed | |
75.859 17.281 93.134 | |
> system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
user system elapsed | |
185.609 17.047 202.638 | |
> system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
user system elapsed | |
186.125 18.062 204.194 | |
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
user system elapsed | |
433.578 35.891 469.432 | |
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
user system elapsed | |
433.031 34.219 467.235 | |
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
user system elapsed | |
77.125 13.344 89.658 | |
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
user system elapsed | |
77.438 13.297 90.337 | |
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
user system elapsed | |
276.328 29.796 306.104 | |
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
user system elapsed | |
276.094 31.766 307.860 | |
> sessionInfo() | |
R version 3.5.0 (2018-04-23) | |
Platform: x86_64-pc-linux-gnu (64-bit) | |
Running under: Ubuntu 16.04.4 LTS | |
Matrix products: default | |
BLAS: /usr/local/lib/R/lib/libRblas.so | |
LAPACK: /usr/local/lib/R/lib/libRlapack.so | |
locale: | |
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 | |
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C | |
attached base packages: | |
[1] stats graphics grDevices utils datasets methods base | |
other attached packages: | |
[1] data.table_1.11.8 | |
loaded via a namespace (and not attached): | |
[1] compiler_3.5.0 tools_3.5.0 yaml_2.1.19 | |
# Dual Xeon Gold 6154 (2 x 18 cores), 36 cores / 72 threads total @ 3.7 GHz max turbo / 3.7 GHz all-core turbo | |
> library(data.table) | |
data.table 1.11.8 Latest news: r-datatable.com | |
> N=2e9; K=100 | |
> set.seed(1) | |
> DT <- data.table( | |
+ id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
+ id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) | |
+ id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) | |
+ id4 = sample(K, N, TRUE), # large groups (int) | |
+ id5 = sample(K, N, TRUE), # large groups (int) | |
+ id6 = sample(N/K, N, TRUE), # small groups (int) | |
+ v1 = sample(5, N, TRUE), # int in range [1,5] | |
+ v2 = sample(5, N, TRUE), # int in range [1,5] | |
+ v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749 | |
+ ) | |
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") | |
GB = 98.359 | |
> system.time( DT[, sum(v1), keyby=id1] ) | |
user system elapsed | |
51.727 16.908 68.624 | |
> system.time( DT[, sum(v1), keyby=id1] ) | |
user system elapsed | |
50.534 13.117 63.641 | |
> system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
user system elapsed | |
131.590 22.851 154.409 | |
> system.time( DT[, sum(v1), keyby="id1,id2"] ) | |
user system elapsed | |
130.969 20.963 151.900 | |
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
user system elapsed | |
339.665 50.357 389.927 | |
> system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] ) | |
user system elapsed | |
339.569 42.565 382.041 | |
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
user system elapsed | |
49.583 19.953 69.292 | |
> system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] ) | |
user system elapsed | |
49.299 13.689 62.820 | |
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
user system elapsed | |
217.549 35.290 252.762 | |
> system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] ) | |
user system elapsed | |
218.605 32.129 250.658 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment