Last active
October 17, 2016 22:14
-
-
Save arunsrinivasan/6140225 to your computer and use it in GitHub Desktop.
data.table version of rbind.fill benchmarking with plyr version of rbind.fill
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The post with the benchmarking results is linked below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698
# This is the script that generated the benchmarks and plots, in case anyone else wants to replicate them.
# Note: the benchmarking takes about 2-3 hours to finish.
require(plyr) | |
require(data.table) | |
require(ggplot2) | |
require(microbenchmark) | |
# data.table version of rbind.fill (first/rough version, improvements should be possible).
#
# Row-binds a list of data.frames whose column sets may differ. Every element is
# padded with NA_character_ columns for the names it lacks, its columns are put
# into one common order, and the padded pieces are stacked with rbindlist().
#
# Args:
#   ll: a list of data.frames (or data.tables).
# Returns:
#   A data.table holding the rows of every element of `ll`, with the union of
#   all column names in order of first appearance.
#
# Note: newer data.table releases offer rbindlist(ll, use.names = TRUE, fill = TRUE)
# for the same purpose; the hand-rolled version is kept because it is the
# subject of this benchmark.
rbind.fill.DT <- function(ll) {
  all.names <- lapply(ll, names) # lapply (not sapply) so the result is always a list
  unq.names <- unique(unlist(all.names))
  padded <- lapply(seq_along(ll), function(x) {
    # Work on a data.table built from the element instead of re-classing the
    # element itself by reference, so the caller's list is left untouched and
    # no internal (:::) data.table functions are needed.
    tt <- as.data.table(ll[[x]])
    missing.cols <- setdiff(unq.names, all.names[[x]])
    # Only call := when there is something to add.
    if (length(missing.cols) > 0L) {
      tt[, (missing.cols) := NA_character_]
    }
    # rbindlist() below binds by position, so align the column order first.
    setcolorder(tt, unq.names)
    tt
  })
  rbindlist(padded)
}
# plyr's rbind.fill, wrapped so both contenders are called the same way
# (a single argument: the list of data.frames to combine).
rbind.fill.PLYR <- function(ll) plyr::rbind.fill(ll)
# Function to generate one random sample data.frame for the benchmark input.
# The seed is fixed so that repeated runs of the script draw the same data.
set.seed(45)
# Returns a 10-row data.frame with between 5 and 15 columns. Column names are
# distinct upper-case letters; cell values are lower-case letters drawn with
# replacement.
sample.fun <- function() {
  # `size` in sample() has to be a single number, so draw exactly one column
  # count from 5:15 rather than passing a whole permutation of 5:15.
  n.cols <- sample(5:15, 1L)
  nam <- sample(LETTERS, n.cols)
  val <- data.frame(matrix(sample(letters, length(nam) * 10, replace = TRUE), nrow = 10))
  setNames(val, nam)
}
# List lengths (number of data.frames to combine) to benchmark.
vals <- seq(1000, 10000, by = 1000)
# For every list length, build the input once, time both implementations 10
# times each, and keep the median / max / min per implementation.
# The result has two rows per entry of `vals`, in the same order as `vals`.
timings <- rbindlist(lapply(vals, function(x) {
  print(x) # progress indicator: the run takes a long time
  # simplify = FALSE guarantees a plain list of data.frames; the default may
  # collapse the result into a matrix when all draws have the same length.
  ll <- replicate(x, sample.fun(), simplify = FALSE)
  ww <- microbenchmark(t1 <- rbind.fill.DT(ll),
                       t2 <- rbind.fill.PLYR(ll), times = 10)
  # microbenchmark records nanoseconds; convert to seconds before summarising.
  data.table(Time = ww$time / 1e9,
             Type = ww$expr)[, list(Median = median(Time),
                                    Max = max(Time), Min = min(Time)), by = Type]
}))
# This is the data I obtained after the time-consuming run | |
# > timings
# Type Median Max Min | |
# 1: t1 <- rbind.fill.DT(ll) 0.9954604 1.048112 0.9857269 | |
# 2: t2 <- rbind.fill.PLYR(ll) 1.2641116 1.338004 1.2212289 | |
# 3: t2 <- rbind.fill.PLYR(ll) 8.4721998 8.735089 5.9465172 | |
# 4: t1 <- rbind.fill.DT(ll) 2.0714826 2.197611 2.0515592 | |
# 5: t2 <- rbind.fill.PLYR(ll) 17.6816796 36.607170 15.4232463 | |
# 6: t1 <- rbind.fill.DT(ll) 3.3759282 3.420131 3.2041830 | |
# 7: t2 <- rbind.fill.PLYR(ll) 34.2991906 94.734541 23.3022808 | |
# 8: t1 <- rbind.fill.DT(ll) 4.6182805 5.113477 4.5497483 | |
# 9: t2 <- rbind.fill.PLYR(ll) 40.7826024 123.284631 32.9824400 | |
# 10: t1 <- rbind.fill.DT(ll) 5.4668071 6.506895 5.1054583 | |
# 11: t2 <- rbind.fill.PLYR(ll) 54.8779719 158.539573 41.0948270 | |
# 12: t1 <- rbind.fill.DT(ll) 6.9966963 7.298266 6.7938445 | |
# 13: t1 <- rbind.fill.DT(ll) 7.8084107 8.513016 7.2517920 | |
# 14: t2 <- rbind.fill.PLYR(ll) 70.8803392 154.592278 62.8290179 | |
# 15: t2 <- rbind.fill.PLYR(ll) 113.2118155 145.739713 91.2401254 | |
# 16: t1 <- rbind.fill.DT(ll) 8.6552054 10.291779 8.2413973 | |
# 17: t2 <- rbind.fill.PLYR(ll) 136.0497395 283.447510 102.8063142 | |
# 18: t1 <- rbind.fill.DT(ll) 10.4087178 11.862445 10.0815655 | |
# 19: t1 <- rbind.fill.DT(ll) 11.8507923 12.687897 11.4170676 | |
# 20: t2 <- rbind.fill.PLYR(ll) 202.2200270 328.348136 171.4381696 | |
# plot it
# Shorten the benchmark expression labels to "DT" / "PLYR" for the legend.
timings[, Type := ifelse(grepl("DT", Type), "DT", "PLYR")]
# `timings` holds two consecutive rows (one per implementation) for each list
# length, in the order of `vals`; reuse `vals` so the x-axis values cannot drift
# out of sync with the benchmark loop.
timings[, List_Length := rep(vals, each = 2)]
pp <- ggplot(data = timings, aes(x = List_Length, y = Median, colour = Type)) +
  geom_line() +
  geom_point()
# Print explicitly so the plot is also drawn when the script is source()d.
print(pp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@arunsrinivasan, does `invisible` do anything here? https://gist.github.com/arunsrinivasan/6140225#file-rbind_fill_benchmarking-L20