Skip to content

Instantly share code, notes, and snippets.

@gghatano
Created January 10, 2014 00:01
Show Gist options
  • Save gghatano/8344475 to your computer and use it in GitHub Desktop.
Save gghatano/8344475 to your computer and use it in GitHub Desktop.
plyr vs doBy speed test and visualize
# Reference:
# http://d.hatena.ne.jp/dichika/20140103/p1
library(devtools)
#install_github("dplyr") # did not work
library(plyr)
library(doBy)
library(reshape2)
library(ggplot2)
set.seed(42)
# Build the data frame: a single factor column "type" holding `obs` labels
# drawn uniformly at random (with replacement) from `types`.
types <- LETTERS[1:5]
obs <- 4e6
dat <- data.frame(
  type = factor(sample(types, size = obs, replace = TRUE))
)
# Measure processing time while growing the number of columns in the data
# frame. Each pass appends one numeric column to `dat` and then times the same
# per-type mean of `value1`, once with plyr::ddply and once with
# doBy::summaryBy.
Nmax <- 10
# Preallocate the result vectors instead of growing them inside the loop.
plyr_time <- numeric(Nmax)
doBy_time <- numeric(Nmax)
# seq_len() yields an empty sequence for Nmax == 0, whereas 1:Nmax would
# iterate over c(1, 0).
for (N in seq_len(Nmax)) {
  # Append column value<N>: uniform draws on [0, 1] rounded to two decimals.
  dat[[paste0("value", N)]] <- round(runif(obs, min = 0, max = 1), digits = 2)
  # print(object.size(dat), units = "MB")
  # Keep only the "elapsed" component of each system.time() result.
  plyr_time[N] <- system.time(
    plyr_res <- ddply(dat, .(type), summarize, mean_percent = mean(value1))
  )[["elapsed"]]
  doBy_time[N] <- system.time(
    doBy_res <- summaryBy(value1 ~ type, data = dat, FUN = mean)
  )[["elapsed"]]
}
# Gather the timings into a long-format data frame and plot them with ggplot2.
# melt() stacks the two timing columns; the output columns are named directly
# as "method" (which package) and "time" (seconds).
res <- melt(
  data.frame(plyr = plyr_time, doBy = doBy_time),
  variable.name = "method",
  value.name = "time"
)
# Column count of `dat` at each measurement (2 .. Nmax + 1), once per method.
res$col.size <- rep(2:(Nmax + 1), times = 2)
ggplot(res, aes(x = col.size, y = time, colour = method)) +
  geom_line(size = 2) +
  geom_point(size = 8) +
  labs(title = "plyr vs doBy", y = "time (sec)") +
  theme(text = element_text(size = 16))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment