arunsrinivasan · December 30, 2015 13:59
diff --git a/dplyr_vs_data.table_1.8.11.R b/dplyr_vs_data.table_1.8.11.R
 # version 1.8.11 (commit 1048)
 require(data.table)
 # Loading required package: data.table
 # data.table 1.8.11  For help type: help("data.table")

 ## Build a huge data.table: seed, size and the random-string generator
 ## -------------------------------------------------------------------
 set.seed(1)  # fixed seed for the random draws that follow
 N <- 2e7     # size of DT (number of rows)

 # generate a character vector of length about 1e5 (one foo() call per element)
 # foo(): returns one random lower-case string of 5 to 9 letters. The length
 # is drawn first and the letters second, which is the same order in which
 # the nested sample() calls consume the random-number stream.
 foo <- function() {
   n_chars <- sample(5:9, 1)
   paste(sample(letters, n_chars, TRUE), collapse="")
 }
 # ch: 1e5 strings produced by foo(), with duplicates removed by unique()
 ch <- replicate(1e5, foo())
 ch <- unique(ch)

 # recorded output:
 # > length(ch)
 # [1] 99982

 # DT now: N rows, four columns, each sampled with replacement
 #   a: numeric, drawn from NA, Inf, -Inf, NaN and 1e6 normal deviates scaled by 1e6
 #   b: numeric, drawn from 1e6 normal deviates
 #   c: drawn from NA_integer_ and the sequence 1e5:1e6
 #   d: drawn from the strings in ch
 DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)), 
                 b = as.numeric(sample(rnorm(1e6), N, TRUE)), 
                 c = sample(c(NA_integer_, 1e5:1e6), N, TRUE), 
                 d = sample(ch, N, TRUE))

 # Make a copy of DT and time setting the key of that copy to column c.
 DT.cp <- copy(DT)
 system.time(setkey(DT.cp, c))
 #   user  system elapsed
 #  6.945   0.196   7.312

 tables() # memory footprint
 #      NAME        NROW  MB COLS    KEY
 # [1,] DT    20,000,000 539 a,b,c,d c
 # [2,] DT.cp 20,000,000 539 a,b,c,d c
 # Total: 1,078MB
 # NOTE(review): the recorded output lists key 'c' for DT as well, although
 # only DT.cp is passed to setkey() above -- confirm against a fresh run.

 require(dplyr) # as of 6th December
 # creating grouped_df from 'dplyr':
 #   DF    - DT converted with data.frame() and wrapped with tbl_df()
 #   DF.cp - DF grouped by column c; the group_by() call itself is timed
 DF <- tbl_df(data.frame(DT))
 system.time(DF.cp <- group_by(DF, c))
 #   user  system elapsed
 # 21.803   1.780  24.970

 # memory footprint of the ungrouped (DF) and the grouped (DF.cp) object
 print(object.size(DF), units='Mb') # 538.9 Mb
 print(object.size(DF.cp), units='Mb') # 545.8 Mb

 ## Borrowing timing function from Hadley:
 ## --------------------------------------
 # benchmark(): time an expression repeatedly and collect the timings.
 #
 # Args:
 #   code:  an unquoted expression. It is captured with substitute() and
 #          evaluated in the caller's frame on every repetition, so any
 #          assignment it performs happens in the caller's environment.
 #   times: number of repetitions; defaults to 3, the count that was
 #          previously hard-coded.
 #
 # Returns:
 #   A matrix with one row of system.time() output per repetition.
 benchmark <- function(code, times = 3L) {
   code <- substitute(code)
   # Resolve the caller's frame once, up front, instead of re-resolving it
   # lazily inside every eval() call.
   env <- parent.frame()
   stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

   timings <- lapply(seq_len(times), function(i) {
     system.time(eval(code, env))
   })
   do.call(rbind, timings)
 }

 ## ----------------------------------------------------------------------------------
 ## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
 ## ----------------------------------------------------------------------------------
 # This section uses DF and DT, i.e. the objects that were not passed to
 # group_by() / setkey(). Each approach is timed on two predicates: column d
 # compared with a string and column c compared with a number. The matrix
 # after each call is the recorded benchmark() output, one row per run.

 # 1a) DF vector-scan subset
 benchmark(DF[DF$d == "ewdjgq", ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     5.592    0.276   6.015          0         0
 # [2,]     5.565    0.024   6.058          0         0
 # [3,]     5.591    0.021   6.003          0         0

 benchmark(DF[DF$c == 169073, ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.413    0.136   4.185          0         0
 # [2,]     3.434    0.119   3.642          0         0
 # [3,]     3.433    0.121   3.701          0         0

 # 1b) ordinary DT vector-scan subset
 benchmark(DT[d == "ewdjgq"])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     2.292    0.075   2.433          0         0
 # [2,]     2.390    0.085   2.727          0         0
 # [3,]     2.277    0.008   2.420          0         0

 benchmark(DT[c == 169073])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.178    0.113   0.295          0         0
 # [2,]     0.178    0.114   0.302          0         0
 # [3,]     0.179    0.112   0.299          0         0

 # 1c) dplyr's 'filter'
 benchmark(filter(DF, d == "ewdjgq"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     2.474    0.010   2.612          0         0
 # [2,]     2.447    0.006   2.504          0         0
 # [3,]     2.443    0.006   2.490          0         0

 benchmark(filter(DF, c == 169073))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.316    0.121   0.504          0         0
 # [2,]     0.318    0.125   0.477          0         0
 # [3,]     0.314    0.120   0.452          0         0

 ## ------------------------------------------------------------------------------
 ## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
 ## ------------------------------------------------------------------------------
 # Same predicates as in section 1, now on DF.cp (grouped by column c) and
 # DT.cp (keyed on column c).

 # 2a) DF vector-scan subset
 benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     5.557    0.025   5.914          0         0
 # [2,]     5.533    0.019   5.830          0         0
 # [3,]     5.540    0.017   5.782          0         0

 benchmark(DF.cp[DF.cp$c == 169073, ])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.428    0.127   3.871          0         0
 # [2,]     3.435    0.120   3.628          0         0
 # [3,]     3.440    0.124   3.658          0         0

 # 2b) ordinary DT vector-scan subset
 benchmark(DT.cp[d == "ewdjgq"])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     2.312    0.011   2.457          0         0
 # [2,]     2.273    0.007   2.324          0         0
 # [3,]     2.262    0.008   2.318          0         0

 benchmark(DT.cp[c == 169073])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.179    0.120   0.307          0         0
 # [2,]     0.178    0.111   0.294          0         0
 # [3,]     0.177    0.111   0.292          0         0

 # 2c) dplyr's 'filter'
 benchmark(filter(DF.cp, d == "ewdjgq"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.515    0.040   3.752          0         0
 # [2,]     3.511    0.020   3.824          0         0
 # [3,]     3.436    0.012   3.516          0         0

 benchmark(filter(DF.cp, c == 169073))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.525    0.181   1.746          0         0
 # [2,]     1.484    0.171   1.701          0         0
 # [3,]     1.451    0.171   1.686          0         0

 # 2d) data.table's binary search (lookup through the key on c via J())
 benchmark(DT.cp[J(169073)])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.001        0   0.002          0         0
 # [2,]     0.002        0   0.002          0         0
 # [3,]     0.002        0   0.002          0         0

 # 2e) dplyr's join approach (doesn't use keys though):
 # inner join of DF.cp with a one-row data.frame on column c
 benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     3.219    0.264   3.571          0         0
 # [2,]     3.222    0.256   3.628          0         0
 # [3,]     3.225    0.253   3.576          0         0


 ## -------------------------------------------------------------
 ## 3) Comparing "arrange" (ordering) from dplyr with data.table:
 ## -------------------------------------------------------------
 # dplyr: order DF.cp by columns b, then c
 benchmark(arrange(DF.cp, b,c))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]    41.046    0.473  43.312          0         0
 # [2,]    41.218    0.511  43.989          0         0
 # [3,]    39.962    0.440  40.741          0         0

 # data.table: key a copy of DT on (b, c); note that copy(DT) sits inside the
 # timed expression, so its cost is part of these timings
 benchmark(setkey(copy(DT), b,c))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]    14.741    1.322  16.180          0         0
 # [2,]    14.364    1.135  15.631          0         0
 # [3,]    12.420    1.028  13.528          0         0

 ## -------------------------------------------------
 ## 4) Comparing "mutate" from dplyr with data.table:
 ## -------------------------------------------------

 # The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems to create a NAM(2) object,
 # whereas ':=' (or 'set') modifies the same object by reference.

 # to make the comparison fair, I'll use 'set' on a copy every time.
 # dplyr: add column e = a + b to DF.cp
 benchmark(mutate(DF.cp, e=a+b))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.456    0.294   1.761          0         0
 # [2,]     1.424    0.290   1.723          0         0
 # [3,]     1.413    0.281   1.696          0         0

 # run this 3 times (both lines by hand; the three result rows are listed below)
 # The copy of DT and the setkey() on c happen outside system.time(); only
 # the set() call that adds column e is timed.
 setkey(DT.cp <- copy(DT), c)
 system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
 #   user  system elapsed 
 #  0.229   0.201   0.429 
 #  0.235   0.203   0.446 
 #  0.237   0.219   0.466 

 ## --------------------------------------------------------------------------
 ## 5) Comparing "join" from dplyr with data.table: (on character column here)
 ## --------------------------------------------------------------------------
 # Rebuild the grouped / keyed objects on column d instead of c
 DF.cp <- group_by(DF, d)
 setkey(DT.cp <- copy(DT), d)

 # Lookup table: 1e3 strings drawn from ch without replacement (seed reset first)
 set.seed(1)
 DF.j <- data.frame(d = sample(ch, 1e3, FALSE), stringsAsFactors=FALSE)
 DT.j <- data.table(DF.j) # no key on DT.j

 # dplyr: left join of the lookup table with DF.cp on d
 benchmark(left_join(DF.j, DF.cp, by="d"))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.616    0.085   1.703          0         0
 # [2,]     1.604    0.077   1.685          0         0
 # [3,]     1.608    0.073   1.692          0         0

 # data.table: join DT.j against DT.cp, which is keyed on d
 benchmark(DT.cp[DT.j])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.017        0   0.018          0         0
 # [2,]     0.018        0   0.019          0         0
 # [3,]     0.017        0   0.018          0         0

 ## ----------------------------------------------------------------------------------
 ## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~9e4 unique groups
 ## ----------------------------------------------------------------------------------
 # NOTE(review): column d is sampled from ch, whose recorded length is 99982,
 # so the group count is probably closer to 1e5 than 9e4 -- confirm.

 # with the groupings on character col. "d"
 # (DF.cp is grouped by d and DT.cp is keyed on d at this point, from section 5)
 # 6a. with C-run function of dplyr
 benchmark(summarise(DF.cp, m.b = sum(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.200    0.001   0.200          0         0
 # [2,]     0.199    0.002   0.292          0         0
 # [3,]     0.200    0.000   0.200          0         0

 # 6b. evaluating the function instead
 # sum__ is a plain alias of sum; presumably the different name keeps dplyr
 # from using its built-in handling of sum() -- TODO confirm
 sum__ <- sum
 benchmark(summarise(DF.cp, m.b = sum__(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.361    0.100   0.463          0         0
 # [2,]     0.352    0.078   0.431          0         0
 # [3,]     0.354    0.065   0.421          0         0

 # 6c. data.table way: sum of b per value of d
 benchmark(DT.cp[, list(m.b=sum(b)), by=d])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.674    0.119   0.807          0         0
 # [2,]     0.663    0.116   0.784          0         0
 # [3,]     0.667    0.120   0.796          0         0

 ## ----------------------------------------------------------------------------------
 ## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~9e5 unique groups
 ## ----------------------------------------------------------------------------------

 # with the groupings on integer col. "c"
 # 7a. with C-run function of dplyr
 # DF.cp is regrouped by c first; the regrouping is not part of the timing
 DF.cp <- group_by(DF, c)
 benchmark(summarise(DF.cp, m.b = sum(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     0.205    0.000   0.205          0         0
 # [2,]     0.199    0.001   0.199          0         0
 # [3,]     0.198    0.000   0.202          0         0

 # 7b. evaluating the function instead (sum__ is the same alias of sum as in 6b)
 sum__ <- sum
 benchmark(summarise(DF.cp, m.b = sum__(b)))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.601    0.074   1.693          0         0
 # [2,]     1.564    0.069   1.660          0         0
 # [3,]     3.226    0.078   3.397          0         0

 # 7c. data.table way: re-key DT.cp on c (untimed), then sum b per value of c
 setkey(DT.cp, c)
 benchmark(DT.cp[, list(m.b=sum(b)), by=c])
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     1.822    0.006   1.894          0         0
 # [2,]     1.817    0.005   1.846          0         0
 # [3,]     1.837    0.008   1.916          0         0
	# version 1.8.11 (commit 1048)
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## Build a huge data.table: seed, size and the random-string generator
	## -------------------------------------------------------------------
	set.seed(1)  # fixed seed for the random draws that follow
	N <- 2e7     # size of DT (number of rows)

	# generate a character vector of length about 1e5 (one foo() call per element)
	# foo(): returns one random lower-case string of 5 to 9 letters. The length
	# is drawn first and the letters second, which is the same order in which
	# the nested sample() calls consume the random-number stream.
	foo <- function() {
	  n_chars <- sample(5:9, 1)
	  paste(sample(letters, n_chars, TRUE), collapse="")
	}
	# ch: 1e5 strings produced by foo(), with duplicates removed by unique()
	ch <- replicate(1e5, foo())
	ch <- unique(ch)

	# recorded output:
	# > length(ch)
	# [1] 99982

	# DT now: N rows, four columns, each sampled with replacement
	#   a: numeric, drawn from NA, Inf, -Inf, NaN and 1e6 normal deviates scaled by 1e6
	#   b: numeric, drawn from 1e6 normal deviates
	#   c: drawn from NA_integer_ and the sequence 1e5:1e6
	#   d: drawn from the strings in ch
	DT <- data.table(a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), N, TRUE)),
	b = as.numeric(sample(rnorm(1e6), N, TRUE)),
	c = sample(c(NA_integer_, 1e5:1e6), N, TRUE),
	d = sample(ch, N, TRUE))

	# Make a copy of DT and time setting the key of that copy to column c.
	DT.cp <- copy(DT)
	system.time(setkey(DT.cp, c))
	# user system elapsed
	# 6.945 0.196 7.312

	tables() # memory footprint
	# NAME NROW MB COLS KEY
	# [1,] DT 20,000,000 539 a,b,c,d c
	# [2,] DT.cp 20,000,000 539 a,b,c,d c
	# Total: 1,078MB
	# NOTE(review): the recorded output lists key 'c' for DT as well, although
	# only DT.cp is passed to setkey() above -- confirm against a fresh run.

	require(dplyr) # as of 6th December
	# creating grouped_df from 'dplyr':
	#   DF    - DT converted with data.frame() and wrapped with tbl_df()
	#   DF.cp - DF grouped by column c; the group_by() call itself is timed
	DF <- tbl_df(data.frame(DT))
	system.time(DF.cp <- group_by(DF, c))
	# user system elapsed
	# 21.803 1.780 24.970

	# memory footprint of the ungrouped (DF) and the grouped (DF.cp) object
	print(object.size(DF), units='Mb') # 538.9 Mb
	print(object.size(DF.cp), units='Mb') # 545.8 Mb

	## Borrowing timing function from Hadley:
	## --------------------------------------
	# benchmark(): time an expression repeatedly and collect the timings.
	#
	# Args:
	#   code:  an unquoted expression. It is captured with substitute() and
	#          evaluated in the caller's frame on every repetition, so any
	#          assignment it performs happens in the caller's environment.
	#   times: number of repetitions; defaults to 3, the count that was
	#          previously hard-coded.
	#
	# Returns:
	#   A matrix with one row of system.time() output per repetition.
	benchmark <- function(code, times = 3L) {
	  code <- substitute(code)
	  # Resolve the caller's frame once, up front, instead of re-resolving it
	  # lazily inside every eval() call.
	  env <- parent.frame()
	  stopifnot(is.numeric(times), length(times) == 1L, !is.na(times), times >= 1)

	  timings <- lapply(seq_len(times), function(i) {
	    system.time(eval(code, env))
	  })
	  do.call(rbind, timings)
	}

	## ----------------------------------------------------------------------------------
	## 1) Comparing "filter" from dplyr with data.table: (on unkey'd / ungrouped data)
	## ----------------------------------------------------------------------------------
	# This section uses DF and DT, i.e. the objects that were not passed to
	# group_by() / setkey(). Each approach is timed on two predicates: column d
	# compared with a string and column c compared with a number. The matrix
	# after each call is the recorded benchmark() output, one row per run.

	# 1a) DF vector-scan subset
	benchmark(DF[DF$d == "ewdjgq", ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 5.592 0.276 6.015 0 0
	# [2,] 5.565 0.024 6.058 0 0
	# [3,] 5.591 0.021 6.003 0 0

	benchmark(DF[DF$c == 169073, ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.413 0.136 4.185 0 0
	# [2,] 3.434 0.119 3.642 0 0
	# [3,] 3.433 0.121 3.701 0 0

	# 1b) ordinary DT vector-scan subset
	benchmark(DT[d == "ewdjgq"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.292 0.075 2.433 0 0
	# [2,] 2.390 0.085 2.727 0 0
	# [3,] 2.277 0.008 2.420 0 0

	benchmark(DT[c == 169073])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.178 0.113 0.295 0 0
	# [2,] 0.178 0.114 0.302 0 0
	# [3,] 0.179 0.112 0.299 0 0

	# 1c) dplyr's 'filter'
	benchmark(filter(DF, d == "ewdjgq"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.474 0.010 2.612 0 0
	# [2,] 2.447 0.006 2.504 0 0
	# [3,] 2.443 0.006 2.490 0 0

	benchmark(filter(DF, c == 169073))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.316 0.121 0.504 0 0
	# [2,] 0.318 0.125 0.477 0 0
	# [3,] 0.314 0.120 0.452 0 0

	## ------------------------------------------------------------------------------
	## 2) Comparing "filter" from dplyr with data.table: (on key'd / grouped data)
	## ------------------------------------------------------------------------------
	# Same predicates as in section 1, now on DF.cp (grouped by column c) and
	# DT.cp (keyed on column c).

	# 2a) DF vector-scan subset
	benchmark(DF.cp[DF.cp$d == "ewdjgq", ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 5.557 0.025 5.914 0 0
	# [2,] 5.533 0.019 5.830 0 0
	# [3,] 5.540 0.017 5.782 0 0

	benchmark(DF.cp[DF.cp$c == 169073, ])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.428 0.127 3.871 0 0
	# [2,] 3.435 0.120 3.628 0 0
	# [3,] 3.440 0.124 3.658 0 0

	# 2b) ordinary DT vector-scan subset
	benchmark(DT.cp[d == "ewdjgq"])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.312 0.011 2.457 0 0
	# [2,] 2.273 0.007 2.324 0 0
	# [3,] 2.262 0.008 2.318 0 0

	benchmark(DT.cp[c == 169073])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.179 0.120 0.307 0 0
	# [2,] 0.178 0.111 0.294 0 0
	# [3,] 0.177 0.111 0.292 0 0

	# 2c) dplyr's 'filter'
	benchmark(filter(DF.cp, d == "ewdjgq"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.515 0.040 3.752 0 0
	# [2,] 3.511 0.020 3.824 0 0
	# [3,] 3.436 0.012 3.516 0 0

	benchmark(filter(DF.cp, c == 169073))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.525 0.181 1.746 0 0
	# [2,] 1.484 0.171 1.701 0 0
	# [3,] 1.451 0.171 1.686 0 0

	# 2d) data.table's binary search (lookup through the key on c via J())
	benchmark(DT.cp[J(169073)])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.001 0 0.002 0 0
	# [2,] 0.002 0 0.002 0 0
	# [3,] 0.002 0 0.002 0 0

	# 2e) dplyr's join approach (doesn't use keys though):
	# inner join of DF.cp with a one-row data.frame on column c
	benchmark(inner_join(DF.cp, data.frame(c = 169073L), by = c("c")))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 3.219 0.264 3.571 0 0
	# [2,] 3.222 0.256 3.628 0 0
	# [3,] 3.225 0.253 3.576 0 0


	## -------------------------------------------------------------
	## 3) Comparing "arrange" (ordering) from dplyr with data.table:
	## -------------------------------------------------------------
	# dplyr: order DF.cp by columns b, then c
	benchmark(arrange(DF.cp, b,c))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 41.046 0.473 43.312 0 0
	# [2,] 41.218 0.511 43.989 0 0
	# [3,] 39.962 0.440 40.741 0 0

	# data.table: key a copy of DT on (b, c); note that copy(DT) sits inside the
	# timed expression, so its cost is part of these timings
	benchmark(setkey(copy(DT), b,c))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 14.741 1.322 16.180 0 0
	# [2,] 14.364 1.135 15.631 0 0
	# [3,] 12.420 1.028 13.528 0 0

	## -------------------------------------------------
	## 4) Comparing "mutate" from dplyr with data.table:
	## -------------------------------------------------

	# The logical equivalent of 'mutate' is ':=' or 'set' to me... 'mutate' seems to create a NAM(2) object,
	# whereas ':=' (or 'set') modifies the same object by reference.

	# to make the comparison fair, I'll use 'set' on a copy every time.
	# dplyr: add column e = a + b to DF.cp
	benchmark(mutate(DF.cp, e=a+b))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.456 0.294 1.761 0 0
	# [2,] 1.424 0.290 1.723 0 0
	# [3,] 1.413 0.281 1.696 0 0

	# run this 3 times (both lines by hand; the three result rows are listed below)
	# The copy of DT and the setkey() on c happen outside system.time(); only
	# the set() call that adds column e is timed.
	setkey(DT.cp <- copy(DT), c)
	system.time(set(DT.cp, i=NULL, j="e", value=DT.cp$a+DT.cp$b))
	# user system elapsed
	# 0.229 0.201 0.429
	# 0.235 0.203 0.446
	# 0.237 0.219 0.466

	## --------------------------------------------------------------------------
	## 5) Comparing "join" from dplyr with data.table: (on character column here)
	## --------------------------------------------------------------------------
	# Rebuild the grouped / keyed objects on column d instead of c
	DF.cp <- group_by(DF, d)
	setkey(DT.cp <- copy(DT), d)

	# Lookup table: 1e3 strings drawn from ch without replacement (seed reset first)
	set.seed(1)
	DF.j <- data.frame(d = sample(ch, 1e3, FALSE), stringsAsFactors=FALSE)
	DT.j <- data.table(DF.j) # no key on DT.j

	# dplyr: left join of the lookup table with DF.cp on d
	benchmark(left_join(DF.j, DF.cp, by="d"))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.616 0.085 1.703 0 0
	# [2,] 1.604 0.077 1.685 0 0
	# [3,] 1.608 0.073 1.692 0 0

	# data.table: join DT.j against DT.cp, which is keyed on d
	benchmark(DT.cp[DT.j])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.017 0 0.018 0 0
	# [2,] 0.018 0 0.019 0 0
	# [3,] 0.017 0 0.018 0 0

	## ----------------------------------------------------------------------------------
	## 6) Comparing "summarise" from dplyr with data.table: (grouped by character column) - ~9e4 unique groups
	## ----------------------------------------------------------------------------------
	# NOTE(review): column d is sampled from ch, whose recorded length is 99982,
	# so the group count is probably closer to 1e5 than 9e4 -- confirm.

	# with the groupings on character col. "d"
	# (DF.cp is grouped by d and DT.cp is keyed on d at this point, from section 5)
	# 6a. with C-run function of dplyr
	benchmark(summarise(DF.cp, m.b = sum(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.200 0.001 0.200 0 0
	# [2,] 0.199 0.002 0.292 0 0
	# [3,] 0.200 0.000 0.200 0 0

	# 6b. evaluating the function instead
	# sum__ is a plain alias of sum; presumably the different name keeps dplyr
	# from using its built-in handling of sum() -- TODO confirm
	sum__ <- sum
	benchmark(summarise(DF.cp, m.b = sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.361 0.100 0.463 0 0
	# [2,] 0.352 0.078 0.431 0 0
	# [3,] 0.354 0.065 0.421 0 0

	# 6c. data.table way: sum of b per value of d
	benchmark(DT.cp[, list(m.b=sum(b)), by=d])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.674 0.119 0.807 0 0
	# [2,] 0.663 0.116 0.784 0 0
	# [3,] 0.667 0.120 0.796 0 0

	## ----------------------------------------------------------------------------------
	## 7) Comparing "summarise" from dplyr with data.table: (grouped by integer column) - ~9e5 unique groups
	## ----------------------------------------------------------------------------------

	# with the groupings on integer col. "c"
	# 7a. with C-run function of dplyr
	# DF.cp is regrouped by c first; the regrouping is not part of the timing
	DF.cp <- group_by(DF, c)
	benchmark(summarise(DF.cp, m.b = sum(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 0.205 0.000 0.205 0 0
	# [2,] 0.199 0.001 0.199 0 0
	# [3,] 0.198 0.000 0.202 0 0

	# 7b. evaluating the function instead (sum__ is the same alias of sum as in 6b)
	sum__ <- sum
	benchmark(summarise(DF.cp, m.b = sum__(b)))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.601 0.074 1.693 0 0
	# [2,] 1.564 0.069 1.660 0 0
	# [3,] 3.226 0.078 3.397 0 0

	# 7c. data.table way: re-key DT.cp on c (untimed), then sum b per value of c
	setkey(DT.cp, c)
	benchmark(DT.cp[, list(m.b=sum(b)), by=c])
	# user.self sys.self elapsed user.child sys.child
	# [1,] 1.822 0.006 1.894 0 0
	# [2,] 1.817 0.005 1.846 0 0
	# [3,] 1.837 0.008 1.916 0 0