Created
December 17, 2013 00:03
-
-
Save arunsrinivasan/7997521 to your computer and use it in GitHub Desktop.
A small comparison between 'dplyr' and 'data.table'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark originally run against data.table version 1.8.11.
# library() stops immediately if the package is missing; require() would only
# return FALSE and let the script fail later with a less obvious error.
library(data.table)
# Startup banner printed in the original run:
# data.table 1.8.11  For help type: help("data.table")

## create a huge data.table:
## -------------------------
# NOTE: the order of the random-number calls below must not change, otherwise
# the data generated under set.seed(1) (and the recorded length of `ch`)
# would no longer be reproducible.
set.seed(1)
N <- 2e7 # size of DT (number of rows)

# generate a character vector of length about 1e5:
# foo() returns one random lower-case string of 5 to 9 letters.
foo <- function() {
  paste(sample(letters, sample(5:9, 1), replace = TRUE), collapse = "")
}
ch <- replicate(1e5, foo())
ch <- unique(ch) # drop the few duplicated strings
# > length(ch)
# [1] 99982

# DT now: four columns, each sampled with replacement to N rows
#   a - doubles including NA, Inf, -Inf and NaN
#   b - doubles
#   c - integers 1e5..1e6 plus NA (used as the grouping / key column below)
#   d - random strings drawn from `ch`
DT <- data.table(
  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6) * 1e6), N, replace = TRUE)),
  b = as.numeric(sample(rnorm(1e6), N, replace = TRUE)),
  c = sample(c(NA_integer_, 1e5:1e6), N, replace = TRUE),
  d = sample(ch, N, replace = TRUE)
)
## setkey on data.table v 1.8.11
## ----------------------------
# Work on a copy so that the original DT stays unkeyed for the later
# 'cold' grouping benchmark.
DT.cp <- copy(DT)
system.time(setkey(DT.cp, c))
#    user  system elapsed
#   6.945   0.196   7.312

## equivalent of setkey in dplyr (group_by) - as of December 6th
## ------------------------------------------------------------
# library() fails fast if dplyr is not installed; require() would only
# return FALSE and let the script fail later.
library(dplyr)
# creating grouped_df from 'dplyr'
# (tbl_df() is the constructor offered by the dplyr version benchmarked here)
DF <- tbl_df(data.frame(DT))
system.time(DF.cp <- group_by(DF, c))
#    user  system elapsed
#  21.803   1.780  24.970

## setkey on data.table takes 7.3 seconds whereas group_by on dplyr takes 24.97 seconds!
## dplyr requires group_by to be able to "summarise" data. Benchmarks for summarise are shown below.
## Borrowing timing function from Hadley:
## --------------------------------------
# Time an expression repeatedly and return one row of timings per run.
#
# Args:
#   code:  an unquoted expression. It is captured with substitute() and
#          re-evaluated in the caller's environment on every run, so any
#          assignments it makes happen in the caller, once per run.
#   times: number of repetitions; defaults to 3, the number of runs the
#          original hard-coded version performed.
#
# Returns:
#   A matrix built by rbind()-ing the system.time() results, one row per run
#   (columns user.self, sys.self, elapsed, user.child, sys.child).
benchmark <- function(code, times = 3L) {
  code <- substitute(code)
  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

  # Capture the caller's frame once, here: inside the helper closure below
  # parent.frame() would no longer refer to benchmark()'s caller.
  caller <- parent.frame()

  runs <- lapply(seq_len(times), function(i) {
    system.time(eval(code, caller))
  })
  do.call(rbind, runs)
}
## -------------------------------------------------
## Comparing "summarise" from dplyr with data.table: - ~9e5 unique groups
## -------------------------------------------------

### ----------------------------------------------------------------
### NOTE THAT 'dplyr' CAN NOT RUN THIS WITHOUT 'group_by' FIRST ####
### ----------------------------------------------------------------

# a. runs entirely in C/C++ - hybrid evaluator...
DF.cp <- group_by(DF, c)
benchmark(summarise(DF.cp, m.b = sum(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     0.205    0.000   0.205          0         0 ### fast but 'group_by' is still 3x slower (24.97 vs 7.3 sec)
# [2,]     0.199    0.001   0.199          0         0
# [3,]     0.198    0.000   0.202          0         0

# b. evaluating the function instead (similar to how data.table does it)
# sum__ is a plain alias of sum, used so the call goes through a different
# name than in (a).
sum__ <- sum
benchmark(summarise(DF.cp, m.b = sum__(b)))
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.601    0.074   1.693          0         0
# [2,]     1.564    0.069   1.660          0         0
# [3,]     3.226    0.078   3.397          0         0

# c. data.table way (with key being set)
setkey(DT.cp, c)
benchmark(DT.cp[, list(m.b = sum(b)), by = c])
#      user.self sys.self elapsed user.child sys.child
# [1,]     1.822    0.006   1.894          0         0
# [2,]     1.817    0.005   1.846          0         0
# [3,]     1.837    0.008   1.916          0         0

### THIS IS NOT POSSIBLE USING 'dplyr'
# d. data.table way - 'cold' by - it doesn't require key being set
# A fresh copy of DT is taken so the table is grouped without a key.
DT.cp <- copy(DT)
benchmark(DT.cp[, list(m.b = sum(b)), by = c])
#      user.self sys.self elapsed user.child sys.child
# [1,]     9.018    0.519  10.466          0         0 ### group_by alone takes 24.97 seconds in 'dplyr'
# [2,]     8.943    0.454   9.822          0         0
# [3,]     8.062    0.412   8.726          0         0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment