Benjamin Chan
r Sys.time()
Clear the workspace environment and load packages.
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 302782 16.2 592000 31.7 350000 18.7
## Vcells 493496 3.8 1023718 7.9 786430 6.0
if (!require(devtools)) {install.packages("devtools")}
## Loading required package: devtools
library(devtools)
source_gist("https://gist.github.com/benjamin-chan/3b59313e8347fffea425")
## Sourcing https://gist.githubusercontent.com/benjamin-chan/3b59313e8347fffea425/raw/c03cd15480a6444399ff5f34892f5911d6e12b44/loadPkg.R
## SHA-1 hash of file is 0bb1bb041c1dda15ee613db701d0c5784d1196a5
loadPkg("doParallel")
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
loadPkg("data.table")
## Loading required package: data.table
.timeStart <- Sys.time()
Set up the test data.
J <- 100 # This is the number of models to fit
N <- 1E5 # This is the size of the dataset
i <- rep(1:N, each=J)
D <- data.table(i, # id
j = rep(1:J, N), # index repitition
x1 = rep(rbinom(N, 1, 0.5), each=J), # group membership, adult/child
x2 = rnorm(N * J), # fake risk factor
x3 = rnorm(N * J), # fake risk factor
x4 = rbinom(N * J, 1, 0.5), # fake risk factor
x5 = rbinom(N * J, 1, 0.5)) # fake risk factor
D <- D[, logitp := -5 + x1 + x2 + x3 + x4 + x5]
D <- D[, p := exp(logitp) / (1 + exp(logitp))]
D <- D[, y := rbinom(N * J, 1, p)]
D <- rbind(D[, k := j],
D[x1 == 0, ][, k := as.integer(J * (x1 + 1) + j)],
D[x1 == 1, ][, k := as.integer(J * (x1 + 1) + j)])
setkey(D, k, i, j)
rm(i)
Define the wrapper functions.
model <- function (D) {
require(data.table)
fx <- formula(y ~ x1 + x2 + x3 + x4 + x5)
M <- glm(fx, data=D, family=binomial)
fitted(M)
}
summarize <- function (t, p) {
data.frame(workers=workers,
user = t[1],
sys = t[2],
elapsed = t[3],
meanPhat = mean(unlist(p)),
row.names = NULL)
}
Run with various number of workers.
workers <- 12
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n12 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
workers <- 8
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n8 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
workers <- 6
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n6 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
workers <- 5
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n5 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
workers <- 4
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n4 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
workers <- 3
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n3 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 556954 29.8 940480 50.3 940480 50.3
## Vcells 150808545 1150.6 349102527 2663.5 321046041 2449.4
workers <- 2
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
phat <- foreach(K = 1:(J * 3)) %dopar% {
model(D[k == K, ])
}
)
stopCluster(cl)
n2 <- summarize(ptime, phat)
rm(ptime, phat)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 556900 29.8 940480 50.3 940480 50.3
## Vcells 150808600 1150.6 349102527 2663.5 321046041 2449.4
workers <- 1
stime <- system.time(
phat <- foreach(K = 1:(J * 3)) %do% {
model(D[k == K, ])
}
)
n1 <- summarize(stime, phat)
rm(stime, phat)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 567333 30.3 1168576 62.5 940480 50.3
## Vcells 150828353 1150.8 349102527 2663.5 349101411 2663.5
Summarize.
data.table(models=J,
size=N,
# rbind(n12, n8, n6, n5, n4, n3, n2, n1))
rbind(n3, n2, n1))
## models size workers user sys elapsed meanPhat
## 1: 100 1e+05 3 10.75 5.12 229.73 0.074669
## 2: 100 1e+05 2 9.11 3.82 201.67 0.074669
## 3: 100 1e+05 1 595.55 86.47 197.01 0.074669
Session info
## Sourcing https://gist.githubusercontent.com/benjamin-chan/80149dd4cdb16b2760ec/raw/a1fafde5c5086024dd01d410cc2f72fb82e93f26/sessionInfo.R
## SHA-1 hash of file is 41209357693515acefb05d4b209340e744a1cbe4
Tune with Revolution R Open 3.2.2 on my 4-core, 2.8GHz, 24 GB desktop