Last active
December 30, 2015 14:09
-
-
Save arunsrinivasan/7839891 to your computer and use it in GitHub Desktop.
Script used to generate results for CologneR user group meet (for reproducibility)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages. `library()` is used instead of `require()` so that a missing
# package stops the script here rather than failing later on the first
# melt/dcast call.
library(reshape2)
# data.table commit (1048)
library(data.table)
# Startup message recorded in the original run:
# data.table 1.8.11 For help type: help("data.table")

# Fixed seed: everything below draws from the RNG in this exact order.
set.seed(1)
N <- 2e7 # size of DT

# generate a character vector of length about 1e5
# foo() builds one random lowercase string of 5 to 9 letters.
foo <- function() paste(sample(letters, sample(5:9, 1), TRUE), collapse="")
ch <- replicate(1e5, foo())
ch <- unique(ch)
# > length(ch)
# [1] 99982

# DT now
# a: doubles including NA, Inf, -Inf and NaN; b: doubles; c: values drawn
# from NA_integer_ and 1e5:1e6; d: strings drawn from `ch`.
DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
                 b = as.numeric(sample(rnorm(1e6), N, TRUE)),
                 c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
                 d = sample(ch, N, TRUE))
tables()
# NAME NROW MB COLS KEY
# [1,] DT 20,000,000 539 a,b,c,d
# Total: 539MB
# timing (run `times` times, 3 by default) - function borrowed from Hadley.
#
# Evaluates `code` repeatedly in the caller's environment and stacks the
# `system.time()` results.
#
# Arguments:
#   code  - an unevaluated expression to time; it is captured with
#           substitute() and re-evaluated on every run, so side effects
#           happen once per run.
#   times - number of runs (a single number >= 1). Defaults to 3.
#
# Returns a numeric matrix with one row per run and one column per
# `system.time()` component (user.self, sys.self, elapsed, user.child,
# sys.child). Keeping the runs separate makes it easy to compare the
# first run against the later ones.
benchmark <- function(code, times = 3L) {
  expr <- substitute(code)
  # Capture the caller's frame once, here: inside the anonymous function
  # below parent.frame() would no longer point at the caller.
  caller_env <- parent.frame()
  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)
  runs <- lapply(seq_len(times), function(i) system.time(eval(expr, caller_env)))
  do.call(rbind, runs)
}
# r-session memory usage - 1GB

## MELT
## ----
# Argument names are spelled out in full (id.vars / measure.vars) instead of
# relying on partial matching of `id` / `measure`.
benchmark(melt(DT, id.vars = "d", measure.vars = 1:2))
# user.self sys.self elapsed user.child sys.child
# [1,] 2.885 0.595 3.554 0 0
# [2,] 1.898 0.579 2.516 0 0
# [3,] 1.894 0.562 2.492 0 0

# compare against reshape2
# The data.frame method is called directly so that reshape2's
# implementation is the one being timed on DT.
benchmark(reshape2:::melt.data.frame(DT, id.vars = "d", measure.vars = 1:2))
# user.self sys.self elapsed user.child sys.child
# [1,] 227.841 11.725 304.533 0 0
# [2,] 166.293 10.032 190.056 0 0
# [3,] 170.237 10.364 195.621 0 0

######################################################################
# max memory used (mem footprint) - DT (1.7GB) vs reshape2 (4.7GB)
# Speedup of ~ 75x!!!!
######################################################################
## CASTING
## -------
# add a new column (showcase new feature in v1.8.11 for 'set')
# `N` (the row count of DT defined above) is used instead of repeating 2e7.
smple <- sample(letters[1:10], N, TRUE)
system.time(set(DT, i=NULL, j="e", value=smple)) # new feature in 1.8.11 - adding new column using set
# user system elapsed
# 0.108 0.116 0.240

# `fun.aggregate` is spelled out in full rather than partially matched
# from `fun`.
benchmark(dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum))
# user.self sys.self elapsed user.child sys.child
# [1,] 14.253 1.011 15.953 0 0
# [2,] 14.149 1.002 15.648 0 0
# [3,] 14.256 1.031 15.934 0 0

#############################################################################################################
# NOTE: We can't run this on current `reshape2` as it segfaults - because of function `split-numerics.cpp`
# in 'plyr'. I made the changes locally (from Hadley's email) and tested 'reshape2'. Here's the benchmarking
# result. I ran it only once as it takes quite a long time.
#############################################################################################################
# If you're interested in trying this, go here: http://gallery.rcpp.org/articles/plyr-c-to-rcpp/
# Copy the C++ code and replace the function in src/split-numerics.c of "plyr"
# You may also have to modify `plyr_split_indices` from RcppExports.cpp as well:
# here's the code: https://github.com/hadley/dplyr/blob/master/src/RcppExports.cpp
# Replace "dplyr" accordingly. Then compile the package.

# The reshape2 side of the comparison is namespace-qualified so that it is
# reshape2's dcast that gets timed, whatever else is on the search path.
system.time(out1 <- reshape2::dcast(DT, d ~ e, value.var="b", fun.aggregate=sum))
# user system elapsed
# 41.697 4.941 46.887

# Check that both implementations produce the same result.
out2 <- dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum)
out2.df <- as.data.frame(out2)
setnames(out2.df, names(out1)) # set names same as out1
identical(out1, out2.df) # [1] TRUE

###############################################################
# max memory usage: 1.8GB (data.table) vs 1.8GB (reshape2)
# dcast.data.table is roughly 3x faster than dcast here.
###############################################################
## more columns on the LHS
# Add an integer column `f` (values 1..10) to cast on two LHS variables.
# `N` is the row count of DT defined at the top of the script.
set(DT, i=NULL, j="f", value=sample(10, N, TRUE))

# reshape2 (namespace-qualified so reshape2's dcast is the one timed);
# `fun.aggregate` is spelled out instead of partially matched from `fun`.
system.time(out1 <- reshape2::dcast(DT, f+d ~ e, value.var="b", fun.aggregate=length))
# user system elapsed
# 174.912 8.143 184.422

# data.table
system.time(out2 <- dcast.data.table(DT, f+d ~ e, value.var="b", fun.aggregate=length))
# user system elapsed
# 25.253 2.249 28.430

# Check that both implementations produce the same result.
out2.df <- as.data.frame(out2)
setnames(out2.df, names(out1)) # set names same as out1
identical(out1, out2.df) # [1] TRUE

###############################################################
# max memory usage: 2.98GB (data.table) vs 3.38GB (reshape2)
# dcast.data.table is roughly 6.5x faster than dcast here.
###############################################################
# other relevant benchmarks:
# https://gist.github.com/arunsrinivasan/7836512 - benchmarks on (mostly) "setkey" for 1.8.11
# https://gist.github.com/arunsrinivasan/7832436 - benchmarks on (mostly) "setkey" for 1.8.10
# https://gist.github.com/arunsrinivasan/7839002 - dplyr vs data.table 1.8.11 commit 1048
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ramnathv Roughly speaking, in large data benchmarks the first run often takes longer due to cache effects (e.g. if the data is moved from RAM into L2 cache by the first run, the remaining runs will be faster, depending on whether the data is larger than the L2 cache, on the hardware, etc.). If I understand correctly, microbenchmark is geared towards repeating a task 100 times and nicely reporting the distribution of times? In large data benchmarks, though, a few runs taking 2-3 minutes each (as above) is more usual, and you're perhaps interested in the difference between the first run and the rest. As far as I know, microbenchmark doesn't report the time of the first run separately, but I know you can change the number of replications from 100 to 3. Its name "micro" suggests it's geared towards timing the overhead of calling a task repeatedly in a loop, which is appropriate if that's what you need to do in practice?