arunsrinivasan · December 17, 2013 00:03
diff --git a/dplyr_data.table_mini_benchmark.R b/dplyr_data.table_mini_benchmark.R
 # version 1.8.11 
 require(data.table)
 # Loading required package: data.table
 # data.table 1.8.11  For help type: help("data.table")

 ## create a huge data.table:
 ## -------------------------
 # Fix the RNG seed so the sampled data below is reproducible.
 set.seed(1)
 N <- 2e7 # size of DT (number of values sampled for each column)

 # generate a character vector of length about 1e5
 # foo(): return a single random lowercase string. The length is drawn first
 # (one value from 5..9), then that many letters are sampled with replacement
 # and concatenated without a separator.
 foo <- function() {
   n_chars <- sample(5:9, 1)
   picked <- sample(letters, n_chars, replace = TRUE)
   paste(picked, collapse = "")
 }
 # Call foo() 1e5 times, then drop duplicates so every string in the pool is distinct.
 ch <- replicate(1e5, foo())
 ch <- unique(ch)

 # > length(ch)
 # [1] 99982

 # DT now
 # Four columns of N values each, every one sampled with replacement:
 #   a - double:    drawn from 1e6 scaled normals plus the special values NA, Inf, -Inf, NaN
 #   b - double:    drawn from 1e6 standard normals
 #   c - integer:   drawn from 1e5..1e6 plus NA; this is the key / grouping column used below
 #   d - character: drawn from the string pool 'ch'
 DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)), 
                  b = as.numeric(sample(rnorm(1e6), N, TRUE)), 
                  c = sample(c(NA_integer_, 1e5:1e6), N, TRUE), 
                  d = sample(ch, N, TRUE))

 ## setkey on data.table v 1.8.11
 ## ----------------------------
 # Key a copy rather than DT itself, so DT stays as generated for the later steps.
 # Only the setkey() call on column 'c' is timed; the copy is not.
 DT.cp <- copy(DT)
 system.time(setkey(DT.cp, c))
 #   user  system elapsed
 #  6.945   0.196   7.312

 ## equivalent of setkey in dplyr (group_by) - as of december 6th
 ## ------------------------------------------------------------
 require(dplyr)
 # creating grouped_df from 'dplyr'
 # DT is first converted to a plain data.frame and wrapped with tbl_df(); that
 # conversion is outside system.time(), so only group_by() on column 'c' is timed.
 DF <- tbl_df(data.frame(DT))
 system.time(DF.cp <- group_by(DF, c))
 #   user  system elapsed
 # 21.803   1.780  24.970

 ## setkey on data.table takes 7.3 seconds whereas group_by on dplyr takes 24.97 seconds!
 ## dplyr requires group_by to be able to "summarise" data. Benchmarks for summarise are shown below.

 ## Borrowing timing function from Hadley:
 ## --------------------------------------
 ## Time an expression several times and collect the timings.
 ##
 ## code:  an expression, passed unevaluated. It is captured with substitute()
 ##        and evaluated afresh on every repetition in the caller's frame, so
 ##        any side effects (e.g. assignments) land in the caller's environment.
 ## times: number of repetitions. Defaults to 3, the count that used to be
 ##        hard-coded, so existing calls behave as before.
 ##
 ## Returns a matrix with one row per repetition and the columns reported by
 ## system.time() (user.self, sys.self, elapsed, user.child, sys.child).
 benchmark <- function(code, times = 3L) {
   code <- substitute(code)
   # Resolve the caller's frame once, here: inside the helper below
   # parent.frame() would refer to a different frame.
   env <- parent.frame()
   stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

   time_once <- function(i) system.time(eval(code, env))
   do.call(rbind, lapply(seq_len(times), time_once))
 }

 ## -------------------------------------------------
 ## Comparing "summarise" from dplyr with data.table: - ~9e5 unique groups
 ## -------------------------------------------------

 ### ----------------------------------------------------------------
 ### NOTE THAT 'dplyr' CAN NOT RUN THIS WITHOUT 'group_by' FIRST ####
 ### ----------------------------------------------------------------

 # a. runs entirely in C/C++ - hybrid evaluator...
 # The grouping is rebuilt here, outside benchmark(), so only summarise()
 # (per-group sum of 'b') is timed, three times over.
 DF.cp <- group_by(DF, c)
 benchmark(summarise(DF.cp, m.b = sum(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.205    0.000   0.205          0         0  ### fast but 'group_by' is still 3x slower (24.97 vs 7.3 sec)
 # [2,]     0.199    0.001   0.199          0         0
 # [3,]     0.198    0.000   0.202          0         0

 # b. evaluating the function instead (similar to how data.table does it)
 # 'sum__' is just another name for base sum(); presumably the unfamiliar name keeps
 # dplyr from taking the C/C++ path used in (a) - TODO confirm against dplyr internals.
 sum__ <- sum
 benchmark(summarise(DF.cp, m.b = sum__(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.601    0.074   1.693          0         0
 # [2,]     1.564    0.069   1.660          0         0
 # [3,]     3.226    0.078   3.397          0         0

 # c. data.table way (with key being set)
 # Same per-group sum of 'b' as in (a)/(b), written as DT[, j, by]. The key on 'c'
 # is set before benchmark() is called, so setkey() is not part of the timings.
 setkey(DT.cp, c)
 benchmark(DT.cp[, list(m.b=sum(b)), by=c])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.822    0.006   1.894          0         0
 # [2,]     1.817    0.005   1.846          0         0
 # [3,]     1.837    0.008   1.916          0         0


 ### THIS IS NOT POSSIBLE USING 'dplyr'
 # d. data.table way - 'cold' by - it doesn't require key being set
 # Start again from a fresh copy of DT (on which setkey() was never called),
 # then time the same grouped sum as in (c).
 DT.cp <- copy(DT)
 benchmark(DT.cp[, list(m.b=sum(b)), by=c])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     9.018    0.519  10.466          0         0     ### group_by alone takes 24.97 seconds in 'dplyr'
 # [2,]     8.943    0.454   9.822          0         0
 # [3,]     8.062    0.412   8.726          0         0
	# version 1.8.11
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	# Fix the RNG seed so the sampled data below is reproducible.
	set.seed(1)
	N <- 2e7 # size of DT (number of values sampled for each column)

	# generate a character vector of length about 1e5
	# foo(): return a single random lowercase string. The length is drawn first
	# (one value from 5..9), then that many letters are sampled with replacement
	# and concatenated without a separator.
	foo <- function() {
	  n_chars <- sample(5:9, 1)
	  picked <- sample(letters, n_chars, replace = TRUE)
	  paste(picked, collapse = "")
	}
	# Call foo() 1e5 times, then drop duplicates so every string in the pool is distinct.
	ch <- replicate(1e5, foo())
	ch <- unique(ch)

	# > length(ch)
	# [1] 99982

	# DT now
	# Four columns of N values each, every one sampled with replacement:
	#   a - double:    drawn from 1e6 scaled normals plus the special values NA, Inf, -Inf, NaN
	#   b - double:    drawn from 1e6 standard normals
	#   c - integer:   drawn from 1e5..1e6 plus NA; this is the key / grouping column used below
	#   d - character: drawn from the string pool 'ch'
	DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
	b = as.numeric(sample(rnorm(1e6), N, TRUE)),
	c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
	d = sample(ch, N, TRUE))

	## setkey on data.table v 1.8.11
	## ----------------------------
	# Key a copy rather than DT itself, so DT stays as generated for the later steps.
	# Only the setkey() call on column 'c' is timed; the copy is not.
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c))
	# user system elapsed
	# 6.945 0.196 7.312

	## equivalent of setkey in dplyr (group_by) - as of december 6th
	## ------------------------------------------------------------
	require(dplyr)
	# creating grouped_df from 'dplyr'
	# DT is first converted to a plain data.frame and wrapped with tbl_df(); that
	# conversion is outside system.time(), so only group_by() on column 'c' is timed.
	DF <- tbl_df(data.frame(DT))
	system.time(DF.cp <- group_by(DF, c))
	# user system elapsed
	# 21.803 1.780 24.970

	## setkey on data.table takes 7.3 seconds whereas group_by on dplyr takes 24.97 seconds!
	## dplyr requires group_by to be able to "summarise" data. Benchmarks for summarise are shown below.

	## Borrowing timing function from Hadley:
	## --------------------------------------
	## Time an expression several times and collect the timings.
	##
	## code:  an expression, passed unevaluated. It is captured with substitute()
	##        and evaluated afresh on every repetition in the caller's frame, so
	##        any side effects (e.g. assignments) land in the caller's environment.
	## times: number of repetitions. Defaults to 3, the count that used to be
	##        hard-coded, so existing calls behave as before.
	##
	## Returns a matrix with one row per repetition and the columns reported by
	## system.time() (user.self, sys.self, elapsed, user.child, sys.child).
	benchmark <- function(code, times = 3L) {
	  code <- substitute(code)
	  # Resolve the caller's frame once, here: inside the helper below
	  # parent.frame() would refer to a different frame.
	  env <- parent.frame()
	  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

	  time_once <- function(i) system.time(eval(code, env))
	  do.call(rbind, lapply(seq_len(times), time_once))
	}

	## -------------------------------------------------
	## Comparing "summarise" from dplyr with data.table: - ~9e5 unique groups
	## -------------------------------------------------

	### ----------------------------------------------------------------
	### NOTE THAT 'dplyr' CAN NOT RUN THIS WITHOUT 'group_by' FIRST ####
	### ----------------------------------------------------------------

	# a. runs entirely in C/C++ - hybrid evaluator...
	# The grouping is rebuilt here, outside benchmark(), so only summarise()
	# (per-group sum of 'b') is timed, three times over.
	DF.cp <- group_by(DF, c)
	benchmark(summarise(DF.cp, m.b = sum(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.205 0.000 0.205 0 0 ### fast but 'group_by' is still 3x slower (24.97 vs 7.3 sec)
	# [2,] 0.199 0.001 0.199 0 0
	# [3,] 0.198 0.000 0.202 0 0

	# b. evaluating the function instead (similar to how data.table does it)
	# 'sum__' is just another name for base sum(); presumably the unfamiliar name keeps
	# dplyr from taking the C/C++ path used in (a) - TODO confirm against dplyr internals.
	sum__ <- sum
	benchmark(summarise(DF.cp, m.b = sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.601 0.074 1.693 0 0
	# [2,] 1.564 0.069 1.660 0 0
	# [3,] 3.226 0.078 3.397 0 0

	# c. data.table way (with key being set)
	# Same per-group sum of 'b' as in (a)/(b), written as DT[, j, by]. The key on 'c'
	# is set before benchmark() is called, so setkey() is not part of the timings.
	setkey(DT.cp, c)
	benchmark(DT.cp[, list(m.b=sum(b)), by=c])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.822 0.006 1.894 0 0
	# [2,] 1.817 0.005 1.846 0 0
	# [3,] 1.837 0.008 1.916 0 0


	### THIS IS NOT POSSIBLE USING 'dplyr'
	# d. data.table way - 'cold' by - it doesn't require key being set
	# Start again from a fresh copy of DT (on which setkey() was never called),
	# then time the same grouped sum as in (c).
	DT.cp <- copy(DT)
	benchmark(DT.cp[, list(m.b=sum(b)), by=c])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 9.018 0.519 10.466 0 0 ### group_by alone takes 24.97 seconds in 'dplyr'
	# [2,] 8.943 0.454 9.822 0 0
	# [3,] 8.062 0.412 8.726 0 0