arunsrinivasan · December 7, 2013 17:44
diff --git a/dplyr_vs_data.table_1.8.11_less_groupings.R b/dplyr_vs_data.table_1.8.11_less_groupings.R
 # version 1.8.11 (commit 1048)
 require(data.table)
 # Loading required package: data.table
 # data.table 1.8.11  For help type: help("data.table")

 ## create a huge data.table:
 ## -------------------------
 set.seed(1)
 N <- 2e7 # size of DT

 # Build the pool of group labels: 1e3 random lowercase strings, each 5 to 9
 # letters long, with duplicates dropped (so at most 1e3 distinct values).
 foo <- function() {
   n_letters <- sample(5:9, 1)
   picked <- sample(letters, n_letters, TRUE)
   paste(picked, collapse="")
 }
 ch <- unique(replicate(1e3, foo()))

 # > length(ch)
 # [1] 1000

 # DT now: N rows and four columns, each drawn with replacement:
 #   a - numeric; pool is NA, Inf, -Inf, NaN plus 1e3 normal draws scaled by 1e6
 #   b - numeric; pool is 1e4 normal draws
 #   c - pool is the integers -1000:1000, i.e. at most 2001 distinct values
 #   d - pool is the strings in ch
 DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e3)*1e6), N, TRUE)), 
                  b = as.numeric(sample(rnorm(1e4), N, TRUE)), 
                  c = sample(c(-1000:1000), N, TRUE), 
                  d = sample(ch, N, TRUE))

 DT.cp <- copy(DT)
 system.time(setkey(DT.cp, c))
 #   user  system elapsed 
 #  5.931   0.276   6.340 
  
 tables() # memory footprint
 #      NAME        NROW  MB COLS    KEY
 # [1,] DT    20,000,000 535 a,b,c,d c
 # [2,] DT.cp 20,000,000 535 a,b,c,d c
 # Total: 1,070MB

 require(dplyr) # as of 6th December
 # creating grouped_df from 'dplyr'
 DF <- tbl_df(data.frame(DT))
 system.time(DF.cp <- group_by(DF, c))
 #   user  system elapsed 
 #  5.166   1.171   6.394 
  
 # memory footprint
 print(object.size(DF), units='Mb') # 534.1 Mb
 print(object.size(DF.cp), units='Mb') # 534.1 Mb

 ## Borrowing timing function from Hadley:
 ## --------------------------------------
 # Time an expression several times and return the timings as a matrix.
 #
 # Args:
 #   code:  Expression to time. It is captured unevaluated with substitute()
 #          and evaluated in the caller's frame, so any assignment it makes
 #          happens in the caller's environment, once per run.
 #   times: Number of runs. Defaults to 3, the count that used to be
 #          hard-coded as three copy-pasted system.time() calls. Must be a
 #          single number >= 1.
 #
 # Returns:
 #   A numeric matrix with one row per run; the columns are the fields of
 #   system.time() (user.self, sys.self, elapsed, user.child, sys.child).
 benchmark <- function(code, times = 3L) {
   code <- substitute(code)
   # Capture the caller's frame up front: inside the closure passed to
   # lapply() below, parent.frame() would no longer be benchmark()'s caller.
   env <- parent.frame()
   stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

   runs <- lapply(seq_len(times), function(i) system.time(eval(code, env)))
   do.call(rbind, runs)
 }

 ## ----------------------------------------------------------------------------------
 ## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
 ## ----------------------------------------------------------------------------------

 # 1a) DF vector-scan subset
 benchmark(DF[DF$d == "ewdjgq", ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     4.097    0.358   4.531          0         0
 # [2,]     3.970    0.008   4.053          0         0
 # [3,]     3.959    0.004   3.980          0         0

 # NOTE(review): column c is sampled from -1000:1000 when DT is built, so the
 # literal 169073 cannot occur in it; this subset (like the other `c == 169073`
 # filters in this script) returns no rows and times a miss rather than a hit.
 # TODO confirm that is intended.
 benchmark(DF[DF$c == 169073, ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.398    0.110   3.546          0         0
 # [2,]     3.369    0.106   3.489          0         0
 # [3,]     3.377    0.110   3.557          0         0

 # 1b) ordinary DT vector-scan subset
 benchmark(DT[d == "ewdjgq"])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.749    0.003   0.775          0         0
 # [2,]     0.746    0.002   0.756          0         0
 # [3,]     0.748    0.002   0.763          0         0

 benchmark(DT[c == 169073])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.169    0.104   0.274          0         0
 # [2,]     0.171    0.105   0.277          0         0
 # [3,]     0.172    0.108   0.282          0         0

 # 1c) dplyr's 'filter'
 benchmark(filter(DF, d == "ewdjgq"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.845    0.002   0.913          0         0
 # [2,]     0.840    0.001   0.847          0         0
 # [3,]     0.843    0.001   0.847          0         0
 
 benchmark(filter(DF, c == 169073))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.268    0.111   0.379          0         0
 # [2,]     0.266    0.111   0.377          0         0
 # [3,]     0.268    0.106   0.374          0         0

 ## ------------------------------------------------------------------------------
 ## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
 ## ------------------------------------------------------------------------------

 # 2a) DF vector-scan subset
 benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     4.002    0.015   4.481          0         0
 # [2,]     3.941    0.005   3.974          0         0
 # [3,]     3.959    0.004   3.993          0         0

 benchmark(DF.cp[DF.cp$c == 169073, ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.420    0.120   3.690          0         0
 # [2,]     3.415    0.121   3.696          0         0
 # [3,]     3.429    0.119   3.723          0         0

 # 2b) ordinary DT vector-scan subset
 benchmark(DT.cp[d == "ewdjgq"])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.746    0.003   0.779          0         0
 # [2,]     0.744    0.003   0.825          0         0
 # [3,]     0.744    0.004   1.011          0         0

 benchmark(DT.cp[c == 169073])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.171    0.106   0.279          0         0
 # [2,]     0.173    0.107   0.280          0         0
 # [3,]     0.169    0.104   0.274          0         0

 # 2c) dplyr's 'filter'
 benchmark(filter(DF.cp, d == "ewdjgq"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.943    0.050   1.017          0         0
 # [2,]     0.943    0.049   1.002          0         0
 # [3,]     0.942    0.046   1.054          0         0

 benchmark(filter(DF.cp, c == 169073))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.351    0.157   0.510          0         0
 # [2,]     0.347    0.150   0.497          0         0
 # [3,]     0.350    0.144   0.504          0         0

 # 2d) data.table's binary search
 benchmark(DT.cp[J(169073)])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.003        0   0.003          0         0
 # [2,]     0.002        0   0.002          0         0
 # [3,]     0.002        0   0.002          0         0

 # 2e) dplyr's join approach (doesn't use keys though):
 benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.054    0.095   1.200          0         0
 # [2,]     1.038    0.091   1.206          0         0
 # [3,]     1.032    0.087   1.129          0         0


 ## -------------------------------------------------------------
 ## 3) Comparing "arrange" (ordering) from dplyr with data.table:
 ## -------------------------------------------------------------
 benchmark(arrange(DF.cp, b,c))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]    41.444    0.503  43.481          0         0
 # [2,]    40.557    0.458  41.569          0         0
 # [3,]    40.066    0.433  40.995          0         0

 benchmark(setkey(copy(DT), b,c))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]    11.047    1.452  12.616          0         0
 # [2,]    11.032    0.886  12.154          0         0
 # [3,]    11.490    0.953  14.389          0         0

 ## -------------------------------------------------
 ## 4) Comparing "mutate" from dplyr with data.table:
 ## -------------------------------------------------

 # To me, the logical equivalent of 'mutate' is ':=' or 'set'. 'mutate' seems to create a NAM(2) object,
 # whereas ':=' (or 'set') modifies the same object by reference.

 # To make the comparison fair, I'll use 'set' on a fresh copy every time.
 benchmark(mutate(DF.cp, e=a+b))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.218    0.195   0.416          0         0
 # [2,]     0.216    0.193   0.411          0         0
 # [3,]     0.222    0.189   0.419          0         0

 # run this 3 times
 setkey(DT.cp <- copy(DT), c)
 system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
 #   user  system elapsed 
 #  0.234   0.212   0.448 
 #  0.235   0.205   0.450 
 #  0.239   0.212   0.461 

 ## --------------------------------------------------------------------------
 ## 5) Comparing "join" from dplyr with data.table: (on character column here)
 ## --------------------------------------------------------------------------
 DF.cp <- group_by(DF, d)
 setkey(DT.cp <- copy(DT), d)

 set.seed(1)
 DF.j <- data.frame(d = sample(ch, 1e2, FALSE), stringsAsFactors=FALSE)
 DT.j <- data.table(DF.j) # no key on DT.j

 benchmark(left_join(DF.j, DF.cp, by="d"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.359    0.197   1.571          0         0
 # [2,]     1.371    0.164   1.613          0         0
 # [3,]     1.389    0.174   1.720          0         0

 benchmark(DT.cp[DT.j])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.139    0.013   0.153          0         0
 # [2,]     0.139    0.004   0.151          0         0
 # [3,]     0.140    0.004   0.147          0         0

 ## ----------------------------------------------------------------------------------
 ## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~1e3 unique groups
 ## ----------------------------------------------------------------------------------

 # with the groupings on character col. "d"
 # 6a. with C-run function of dplyr
 benchmark(summarise(DF.cp, m.b = sum(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.202    0.000   0.203          0         0
 # [2,]     0.197    0.000   0.199          0         0
 # [3,]     0.200    0.001   0.210          0         0

 # 6b. evaluating the function instead
 sum__ <- sum
 benchmark(summarise(DF.cp, m.b = sum__(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.202    0.122   0.324          0         0
 # [2,]     0.207    0.129   0.353          0         0
 # [3,]     0.207    0.127   0.340          0         0

 # 6c. data.table way
 benchmark(DT.cp[, list(m.b=sum(b)), by=d])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.460    0.112   0.575          0         0
 # [2,]     0.454    0.117   0.572          0         0
 # [3,]     0.451    0.117   0.568          0         0

 ## ----------------------------------------------------------------------------------
 ## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~2e3 unique groups
 ## ----------------------------------------------------------------------------------

 # with the groupings on integer col. "c"
 # 7a. with C-run function of dplyr
 DF.cp <- group_by(DF, c)
 benchmark(summarise(DF.cp, m.b = sum(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.199    0.000   0.204          0         0
 # [2,]     0.197    0.000   0.198          0         0
 # [3,]     0.200    0.001   0.200          0         0

 # 7b. evaluating the function instead
 sum__ <- sum
 benchmark(summarise(DF.cp, m.b = sum__(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.203    0.095   0.298          0         0
 # [2,]     0.205    0.097   0.303          0         0
 # [3,]     0.206    0.089   0.297          0         0

 # 7c. data.table way
 setkey(DT.cp, c)
 benchmark(DT.cp[, list(m.b=sum(b)), by=c])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.424    0.002   0.443          0         0
 # [2,]     0.415    0.000   0.427          0         0
 # [3,]     0.413    0.000   0.415          0         0
	# version 1.8.11 (commit 1048)
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	set.seed(1)
	N <- 2e7 # size of DT

	# Build the pool of group labels: 1e3 random lowercase strings, each 5 to 9
	# letters long, with duplicates dropped (so at most 1e3 distinct values).
	foo <- function() {
	  n_letters <- sample(5:9, 1)
	  picked <- sample(letters, n_letters, TRUE)
	  paste(picked, collapse="")
	}
	ch <- unique(replicate(1e3, foo()))

	# > length(ch)
	# [1] 1000

	# DT now: N rows and four columns, each drawn with replacement:
	#   a - numeric; pool is NA, Inf, -Inf, NaN plus 1e3 normal draws scaled by 1e6
	#   b - numeric; pool is 1e4 normal draws
	#   c - pool is the integers -1000:1000, i.e. at most 2001 distinct values
	#   d - pool is the strings in ch
	DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e3)*1e6), N, TRUE)),
	b = as.numeric(sample(rnorm(1e4), N, TRUE)),
	c = sample(c(-1000:1000), N, TRUE),
	d = sample(ch, N, TRUE))

	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c))
	# user system elapsed
	# 5.931 0.276 6.340

	tables() # memory footprint
	# NAME NROW MB COLS KEY
	# [1,] DT 20,000,000 535 a,b,c,d c
	# [2,] DT.cp 20,000,000 535 a,b,c,d c
	# Total: 1,070MB

	require(dplyr) # as of 6th December
	# creating grouped_df from 'dplyr'
	DF <- tbl_df(data.frame(DT))
	system.time(DF.cp <- group_by(DF, c))
	# user system elapsed
	# 5.166 1.171 6.394

	# memory footprint
	print(object.size(DF), units='Mb') # 534.1 Mb
	print(object.size(DF.cp), units='Mb') # 534.1 Mb

	## Borrowing timing function from Hadley:
	## --------------------------------------
	# Time an expression several times and return the timings as a matrix.
	#
	# Args:
	#   code:  Expression to time. It is captured unevaluated with substitute()
	#          and evaluated in the caller's frame, so any assignment it makes
	#          happens in the caller's environment, once per run.
	#   times: Number of runs. Defaults to 3, the count that used to be
	#          hard-coded as three copy-pasted system.time() calls. Must be a
	#          single number >= 1.
	#
	# Returns:
	#   A numeric matrix with one row per run; the columns are the fields of
	#   system.time() (user.self, sys.self, elapsed, user.child, sys.child).
	benchmark <- function(code, times = 3L) {
	  code <- substitute(code)
	  # Capture the caller's frame up front: inside the closure passed to
	  # lapply() below, parent.frame() would no longer be benchmark()'s caller.
	  env <- parent.frame()
	  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

	  runs <- lapply(seq_len(times), function(i) system.time(eval(code, env)))
	  do.call(rbind, runs)
	}

	## ----------------------------------------------------------------------------------
	## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
	## ----------------------------------------------------------------------------------

	# 1a) DF vector-scan subset
	benchmark(DF[DF$d == "ewdjgq", ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 4.097 0.358 4.531 0 0
	# [2,] 3.970 0.008 4.053 0 0
	# [3,] 3.959 0.004 3.980 0 0

	# NOTE(review): column c is sampled from -1000:1000 when DT is built, so the
	# literal 169073 cannot occur in it; this subset (like the other `c == 169073`
	# filters in this script) returns no rows and times a miss rather than a hit.
	# TODO confirm that is intended.
	benchmark(DF[DF$c == 169073, ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.398 0.110 3.546 0 0
	# [2,] 3.369 0.106 3.489 0 0
	# [3,] 3.377 0.110 3.557 0 0

	# 1b) ordinary DT vector-scan subset
	benchmark(DT[d == "ewdjgq"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.749 0.003 0.775 0 0
	# [2,] 0.746 0.002 0.756 0 0
	# [3,] 0.748 0.002 0.763 0 0

	benchmark(DT[c == 169073])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.169 0.104 0.274 0 0
	# [2,] 0.171 0.105 0.277 0 0
	# [3,] 0.172 0.108 0.282 0 0

	# 1c) dplyr's 'filter'
	benchmark(filter(DF, d == "ewdjgq"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.845 0.002 0.913 0 0
	# [2,] 0.840 0.001 0.847 0 0
	# [3,] 0.843 0.001 0.847 0 0

	benchmark(filter(DF, c == 169073))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.268 0.111 0.379 0 0
	# [2,] 0.266 0.111 0.377 0 0
	# [3,] 0.268 0.106 0.374 0 0

	## ------------------------------------------------------------------------------
	## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
	## ------------------------------------------------------------------------------

	# 2a) DF vector-scan subset
	benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 4.002 0.015 4.481 0 0
	# [2,] 3.941 0.005 3.974 0 0
	# [3,] 3.959 0.004 3.993 0 0

	benchmark(DF.cp[DF.cp$c == 169073, ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.420 0.120 3.690 0 0
	# [2,] 3.415 0.121 3.696 0 0
	# [3,] 3.429 0.119 3.723 0 0

	# 2b) ordinary DT vector-scan subset
	benchmark(DT.cp[d == "ewdjgq"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.746 0.003 0.779 0 0
	# [2,] 0.744 0.003 0.825 0 0
	# [3,] 0.744 0.004 1.011 0 0

	benchmark(DT.cp[c == 169073])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.171 0.106 0.279 0 0
	# [2,] 0.173 0.107 0.280 0 0
	# [3,] 0.169 0.104 0.274 0 0

	# 2c) dplyr's 'filter'
	benchmark(filter(DF.cp, d == "ewdjgq"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.943 0.050 1.017 0 0
	# [2,] 0.943 0.049 1.002 0 0
	# [3,] 0.942 0.046 1.054 0 0

	benchmark(filter(DF.cp, c == 169073))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.351 0.157 0.510 0 0
	# [2,] 0.347 0.150 0.497 0 0
	# [3,] 0.350 0.144 0.504 0 0

	# 2d) data.table's binary search
	benchmark(DT.cp[J(169073)])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.003 0 0.003 0 0
	# [2,] 0.002 0 0.002 0 0
	# [3,] 0.002 0 0.002 0 0

	# 2e) dplyr's join approach (doesn't use keys though):
	benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.054 0.095 1.200 0 0
	# [2,] 1.038 0.091 1.206 0 0
	# [3,] 1.032 0.087 1.129 0 0


	## -------------------------------------------------------------
	## 3) Comparing "arrange" (ordering) from dplyr with data.table:
	## -------------------------------------------------------------
	benchmark(arrange(DF.cp, b,c))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 41.444 0.503 43.481 0 0
	# [2,] 40.557 0.458 41.569 0 0
	# [3,] 40.066 0.433 40.995 0 0

	benchmark(setkey(copy(DT), b,c))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 11.047 1.452 12.616 0 0
	# [2,] 11.032 0.886 12.154 0 0
	# [3,] 11.490 0.953 14.389 0 0

	## -------------------------------------------------
	## 4) Comparing "mutate" from dplyr with data.table:
	## -------------------------------------------------

	# To me, the logical equivalent of 'mutate' is ':=' or 'set'. 'mutate' seems to create a NAM(2) object,
	# whereas ':=' (or 'set') modifies the same object by reference.

	# To make the comparison fair, I'll use 'set' on a fresh copy every time.
	benchmark(mutate(DF.cp, e=a+b))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.218 0.195 0.416 0 0
	# [2,] 0.216 0.193 0.411 0 0
	# [3,] 0.222 0.189 0.419 0 0

	# run this 3 times
	setkey(DT.cp <- copy(DT), c)
	system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
	# user system elapsed
	# 0.234 0.212 0.448
	# 0.235 0.205 0.450
	# 0.239 0.212 0.461

	## --------------------------------------------------------------------------
	## 5) Comparing "join" from dplyr with data.table: (on character column here)
	## --------------------------------------------------------------------------
	DF.cp <- group_by(DF, d)
	setkey(DT.cp <- copy(DT), d)

	set.seed(1)
	DF.j <- data.frame(d = sample(ch, 1e2, FALSE), stringsAsFactors=FALSE)
	DT.j <- data.table(DF.j) # no key on DT.j

	benchmark(left_join(DF.j, DF.cp, by="d"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.359 0.197 1.571 0 0
	# [2,] 1.371 0.164 1.613 0 0
	# [3,] 1.389 0.174 1.720 0 0

	benchmark(DT.cp[DT.j])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.139 0.013 0.153 0 0
	# [2,] 0.139 0.004 0.151 0 0
	# [3,] 0.140 0.004 0.147 0 0

	## ----------------------------------------------------------------------------------
	## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~1e3 unique groups
	## ----------------------------------------------------------------------------------

	# with the groupings on character col. "d"
	# 6a. with C-run function of dplyr
	benchmark(summarise(DF.cp, m.b = sum(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.202 0.000 0.203 0 0
	# [2,] 0.197 0.000 0.199 0 0
	# [3,] 0.200 0.001 0.210 0 0

	# 6b. evaluating the function instead
	sum__ <- sum
	benchmark(summarise(DF.cp, m.b = sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.202 0.122 0.324 0 0
	# [2,] 0.207 0.129 0.353 0 0
	# [3,] 0.207 0.127 0.340 0 0

	# 6c. data.table way
	benchmark(DT.cp[, list(m.b=sum(b)), by=d])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.460 0.112 0.575 0 0
	# [2,] 0.454 0.117 0.572 0 0
	# [3,] 0.451 0.117 0.568 0 0

	## ----------------------------------------------------------------------------------
	## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~2e3 unique groups
	## ----------------------------------------------------------------------------------

	# with the groupings on integer col. "c"
	# 7a. with C-run function of dplyr
	DF.cp <- group_by(DF, c)
	benchmark(summarise(DF.cp, m.b = sum(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.199 0.000 0.204 0 0
	# [2,] 0.197 0.000 0.198 0 0
	# [3,] 0.200 0.001 0.200 0 0

	# 7b. evaluating the function instead
	sum__ <- sum
	benchmark(summarise(DF.cp, m.b = sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.203 0.095 0.298 0 0
	# [2,] 0.205 0.097 0.303 0 0
	# [3,] 0.206 0.089 0.297 0 0

	# 7c. data.table way
	setkey(DT.cp, c)
	benchmark(DT.cp[, list(m.b=sum(b)), by=c])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.424 0.002 0.443 0 0
	# [2,] 0.415 0.000 0.427 0 0
	# [3,] 0.413 0.000 0.415 0 0