arunsrinivasan · December 30, 2015 14:09 · mattdowle · Dec 9, 2013
diff --git a/CologneR.R b/CologneR.R
 require(reshape2)

 # data.table commit (1048)
 require(data.table)
 # Loading required package: data.table
 # data.table 1.8.11  For help type: help("data.table")

 # Fix the RNG seed so that every random draw below is repeatable.
 set.seed(1)
 # size of DT (number of rows)
 N <- 2e7

 # Helper used to generate a character vector of length about 1e5:
 # draw a length from 5:9 first, then sample that many lowercase letters
 # (with replacement) and collapse them into a single string.
 foo <- function() {
   len <- sample(5:9, 1)
   chars <- sample(letters, len, TRUE)
   paste(chars, collapse="")
 }
 # 1e5 random strings from foo(), de-duplicated so every entry is distinct
 ch <- unique(replicate(1e5, foo()))

 # > length(ch)
 # [1] 99982

 # DT now
 # Build the N-row benchmark table. Columns are generated in the order
 # a, b, c, d so the sequence of random draws stays the same.
 DT <- data.table(
   # doubles sampled from a pool that also contains NA, Inf, -Inf and NaN
   a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), size=N, replace=TRUE)),
   # doubles sampled from 1e6 standard-normal values
   b = as.numeric(sample(rnorm(1e6), size=N, replace=TRUE)),
   # values sampled from 1e5:1e6 plus a missing value
   c = sample(c(NA_integer_, 1e5:1e6), size=N, replace=TRUE),
   # strings sampled from the unique character vector `ch`
   d = sample(ch, size=N, replace=TRUE)
 )

 # list the data.tables currently in memory
 tables()
 #      NAME       NROW  MB COLS    KEY
 # [1,] DT   20,000,000 539 a,b,c,d    
 # Total: 539MB

 # timing (run 3 times) - function borrowed from Hadley.
 # Evaluate `code` three times in the caller's environment and return the
 # three system.time() results stacked as the rows of a matrix.
 benchmark <- function(code) {
   expr <- substitute(code)    # capture the expression without evaluating it
   caller <- parent.frame()    # environment benchmark() was called from
   timings <- lapply(1:3, function(run) system.time(eval(expr, caller)))
   do.call(rbind, timings)
 }

 # r-session memory usage - 1GB

 ## MELT
 ## ----

 # data.table side of the comparison: keep "d" as the id column and melt
 # columns 1:2 of DT into long form. Argument names are written out in full
 # instead of relying on partial matching of `id` / `measure`.
 # The lines below each call are the recorded console output, kept as comments
 # so that the script remains valid R.
 benchmark(melt(DT, id.vars="d", measure.vars=1:2))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]     2.885    0.595   3.554          0         0
 # [2,]     1.898    0.579   2.516          0         0
 # [3,]     1.894    0.562   2.492          0         0

 # compare against reshape2 (the data.frame method is called explicitly)
 benchmark(reshape2:::melt.data.frame(DT, id.vars="d", measure.vars=1:2))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]   227.841   11.725 304.533          0         0
 # [2,]   166.293   10.032 190.056          0         0
 # [3,]   170.237   10.364 195.621          0         0

 ######################################################################
 # max memory used (mem footprint) - DT (1.7GB) vs reshape2 (4.7GB)
 # Speedup of ~ 75x!!!!
 ######################################################################

 ## CASTING
 ## -------
 # add a new column (showcase new feature in v1.8.11 for 'set')
 # One random letter (a-j) per row of DT. The size is taken from the table
 # itself rather than repeating the literal 2e7, so the new column always
 # matches DT even if N is changed above.
 smple <- sample(letters[1:10], nrow(DT), TRUE)
 system.time(set(DT, i=NULL, j="e", value=smple)) # new feature in 1.8.11 - adding new column using set
 #   user  system elapsed 
 #  0.108   0.116   0.240 

 # Cast to wide form: one row per `d`, one column per `e`, summing `b`.
 # `fun.aggregate` is spelled out in full instead of the partial match `fun`.
 benchmark(dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum))
 #      user.self sys.self elapsed user.child sys.child
 # [1,]    14.253    1.011  15.953          0         0
 # [2,]    14.149    1.002  15.648          0         0
 # [3,]    14.256    1.031  15.934          0         0

 #############################################################################################################
 # NOTE: We can't run this on current `reshape2` as it segfaults - because of function `split-numerics.cpp` 
 # in 'plyr'. I made the changes locally (from Hadley's email) and tested 'reshape2'. Here's the benchmarking
 # result. I ran it only once as it takes quite a long time. 
 #############################################################################################################

 # If you're interested in trying this, go here: http://gallery.rcpp.org/articles/plyr-c-to-rcpp/
 # Copy the C++ code and replace the function in src/split-numerics.c of "plyr"
 # You may also have to modify `plyr_split_indices` from RcppExports.cpp as well: 
 # here's the code: https://github.com/hadley/dplyr/blob/master/src/RcppExports.cpp
 # Replace "dplyr" accordingly. Then compile the package.

 # reshape2 side of the comparison. The call is namespace-qualified so it is
 # reshape2's dcast that gets timed even if another attached package
 # (e.g. a newer data.table) also provides a function named `dcast`.
 # `fun.aggregate` is spelled out in full instead of the partial match `fun`.
 system.time(out1 <- reshape2::dcast(DT, d ~ e, value.var="b", fun.aggregate=sum))
 #   user  system elapsed 
 # 41.697   4.941  46.887 

 # Same cast with data.table, then check that both results agree.
 out2 <- dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum)
 out2.df <- as.data.frame(out2)
 setnames(out2.df, names(out1)) # set names same as out1
 identical(out1, out2.df) # [1] TRUE

 ###############################################################
 # max memory usage: 1.8GB (data.table) vs 1.8GB (reshape2)
 # dcast.data.table is about 3x faster than dcast here.
 ###############################################################

 ## more columns on the LHS
 # Add an integer column "f" with values 1..10, one per row of DT. The size is
 # taken from the table itself rather than repeating the literal 2e7.
 set(DT, i=NULL, j="f", value=sample(10, nrow(DT), TRUE))

 # reshape2: count rows per (f, d) x e. The call is namespace-qualified so it
 # is reshape2's dcast that gets timed even if another attached package also
 # provides `dcast`; `fun.aggregate` is spelled out instead of `fun`.
 system.time(out1 <- reshape2::dcast(DT, f+d ~ e, value.var="b", fun.aggregate=length))
 #    user  system elapsed 
 # 174.912   8.143 184.422 

 # data.table: the same cast
 system.time(out2 <- dcast.data.table(DT, f+d ~ e, value.var="b", fun.aggregate=length))
 #   user  system elapsed 
 # 25.253   2.249  28.430 

 # check that both results agree
 out2.df <- as.data.frame(out2)
 setnames(out2.df, names(out1)) # set names same as out1
 identical(out1, out2.df) # [1] TRUE

 ###############################################################
 # max memory usage: 2.98GB (data.table) vs 3.38GB (reshape2)
 # dcast.data.table is about 6.5x faster than dcast here.
 ###############################################################

 # other relevant benchmarks:
 # https://gist.github.com/arunsrinivasan/7836512 - benchmarks on (mostly) "setkey" for 1.8.11
 # https://gist.github.com/arunsrinivasan/7832436 - benchmarks on (mostly) "setkey" for 1.8.10
 # https://gist.github.com/arunsrinivasan/7839002 - dplyr vs data.table 1.8.11 commit 1048
	require(reshape2)

	# data.table commit (1048)
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	# Fix the RNG seed so that every random draw below is repeatable.
	set.seed(1)
	# size of DT (number of rows)
	N <- 2e7

	# Helper used to generate a character vector of length about 1e5:
	# draw a length from 5:9 first, then sample that many lowercase letters
	# (with replacement) and collapse them into a single string.
	foo <- function() {
	  len <- sample(5:9, 1)
	  chars <- sample(letters, len, TRUE)
	  paste(chars, collapse="")
	}
	# 1e5 random strings from foo(), de-duplicated so every entry is distinct
	ch <- unique(replicate(1e5, foo()))

	# > length(ch)
	# [1] 99982

	# DT now
	# Build the N-row benchmark table. Columns are generated in the order
	# a, b, c, d so the sequence of random draws stays the same.
	DT <- data.table(
	  # doubles sampled from a pool that also contains NA, Inf, -Inf and NaN
	  a = as.numeric(sample(c(NA, Inf, -Inf, NaN, rnorm(1e6)*1e6), size=N, replace=TRUE)),
	  # doubles sampled from 1e6 standard-normal values
	  b = as.numeric(sample(rnorm(1e6), size=N, replace=TRUE)),
	  # values sampled from 1e5:1e6 plus a missing value
	  c = sample(c(NA_integer_, 1e5:1e6), size=N, replace=TRUE),
	  # strings sampled from the unique character vector `ch`
	  d = sample(ch, size=N, replace=TRUE)
	)

	# list the data.tables currently in memory
	tables()
	# NAME NROW MB COLS KEY
	# [1,] DT 20,000,000 539 a,b,c,d
	# Total: 539MB

	# timing (run 3 times) - function borrowed from Hadley.
	# Evaluate `code` three times in the caller's environment and return the
	# three system.time() results stacked as the rows of a matrix.
	benchmark <- function(code) {
	  expr <- substitute(code)    # capture the expression without evaluating it
	  caller <- parent.frame()    # environment benchmark() was called from
	  timings <- lapply(1:3, function(run) system.time(eval(expr, caller)))
	  do.call(rbind, timings)
	}

	# r-session memory usage - 1GB

	## MELT
	## ----

	# data.table side of the comparison: keep "d" as the id column and melt
	# columns 1:2 of DT into long form. Argument names are written out in full
	# instead of relying on partial matching of `id` / `measure`.
	# The lines below each call are the recorded console output, kept as comments
	# so that the script remains valid R.
	benchmark(melt(DT, id.vars="d", measure.vars=1:2))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 2.885 0.595 3.554 0 0
	# [2,] 1.898 0.579 2.516 0 0
	# [3,] 1.894 0.562 2.492 0 0

	# compare against reshape2 (the data.frame method is called explicitly)
	benchmark(reshape2:::melt.data.frame(DT, id.vars="d", measure.vars=1:2))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 227.841 11.725 304.533 0 0
	# [2,] 166.293 10.032 190.056 0 0
	# [3,] 170.237 10.364 195.621 0 0

	######################################################################
	# max memory used (mem footprint) - DT (1.7GB) vs reshape2 (4.7GB)
	# Speedup of ~ 75x!!!!
	######################################################################

	## CASTING
	## -------
	# add a new column (showcase new feature in v1.8.11 for 'set')
	# One random letter (a-j) per row of DT. The size is taken from the table
	# itself rather than repeating the literal 2e7, so the new column always
	# matches DT even if N is changed above.
	smple <- sample(letters[1:10], nrow(DT), TRUE)
	system.time(set(DT, i=NULL, j="e", value=smple)) # new feature in 1.8.11 - adding new column using set
	# user system elapsed
	# 0.108 0.116 0.240

	# Cast to wide form: one row per `d`, one column per `e`, summing `b`.
	# `fun.aggregate` is spelled out in full instead of the partial match `fun`.
	benchmark(dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum))
	# user.self sys.self elapsed user.child sys.child
	# [1,] 14.253 1.011 15.953 0 0
	# [2,] 14.149 1.002 15.648 0 0
	# [3,] 14.256 1.031 15.934 0 0

	#############################################################################################################
	# NOTE: We can't run this on current `reshape2` as it segfaults - because of function `split-numerics.cpp`
	# in 'plyr'. I made the changes locally (from Hadley's email) and tested 'reshape2'. Here's the benchmarking
	# result. I ran it only once as it takes quite a long time.
	#############################################################################################################

	# If you're interested in trying this, go here: http://gallery.rcpp.org/articles/plyr-c-to-rcpp/
	# Copy the C++ code and replace the function in src/split-numerics.c of "plyr"
	# You may also have to modify `plyr_split_indices` from RcppExports.cpp as well:
	# here's the code: https://github.com/hadley/dplyr/blob/master/src/RcppExports.cpp
	# Replace "dplyr" accordingly. Then compile the package.

	# reshape2 side of the comparison. The call is namespace-qualified so it is
	# reshape2's dcast that gets timed even if another attached package
	# (e.g. a newer data.table) also provides a function named `dcast`.
	# `fun.aggregate` is spelled out in full instead of the partial match `fun`.
	system.time(out1 <- reshape2::dcast(DT, d ~ e, value.var="b", fun.aggregate=sum))
	# user system elapsed
	# 41.697 4.941 46.887

	# Same cast with data.table, then check that both results agree.
	out2 <- dcast.data.table(DT, d ~ e, value.var="b", fun.aggregate=sum)
	out2.df <- as.data.frame(out2)
	setnames(out2.df, names(out1)) # set names same as out1
	identical(out1, out2.df) # [1] TRUE

	###############################################################
	# max memory usage: 1.8GB (data.table) vs 1.8GB (reshape2)
	# dcast.data.table is about 3x faster than dcast here.
	###############################################################

	## more columns on the LHS
	# Add an integer column "f" with values 1..10, one per row of DT. The size is
	# taken from the table itself rather than repeating the literal 2e7.
	set(DT, i=NULL, j="f", value=sample(10, nrow(DT), TRUE))

	# reshape2: count rows per (f, d) x e. The call is namespace-qualified so it
	# is reshape2's dcast that gets timed even if another attached package also
	# provides `dcast`; `fun.aggregate` is spelled out instead of `fun`.
	system.time(out1 <- reshape2::dcast(DT, f+d ~ e, value.var="b", fun.aggregate=length))
	# user system elapsed
	# 174.912 8.143 184.422

	# data.table: the same cast
	system.time(out2 <- dcast.data.table(DT, f+d ~ e, value.var="b", fun.aggregate=length))
	# user system elapsed
	# 25.253 2.249 28.430

	# check that both results agree
	out2.df <- as.data.frame(out2)
	setnames(out2.df, names(out1)) # set names same as out1
	identical(out1, out2.df) # [1] TRUE

	###############################################################
	# max memory usage: 2.98GB (data.table) vs 3.38GB (reshape2)
	# dcast.data.table is about 6.5x faster than dcast here.
	###############################################################

	# other relevant benchmarks:
	# https://gist.github.com/arunsrinivasan/7836512 - benchmarks on (mostly) "setkey" for 1.8.11
	# https://gist.github.com/arunsrinivasan/7832436 - benchmarks on (mostly) "setkey" for 1.8.10
	# https://gist.github.com/arunsrinivasan/7839002 - dplyr vs data.table 1.8.11 commit 1048