Parallel GLM tuning

Benjamin Chan
r Sys.time()

Clear the workspace environment and load packages.

rm(list=ls())
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 302782 16.2     592000 31.7   350000 18.7
## Vcells 493496  3.8    1023718  7.9   786430  6.0

if (!require(devtools)) {install.packages("devtools")}

## Loading required package: devtools

library(devtools)
source_gist("https://gist.github.com/benjamin-chan/3b59313e8347fffea425")

## Sourcing https://gist.githubusercontent.com/benjamin-chan/3b59313e8347fffea425/raw/c03cd15480a6444399ff5f34892f5911d6e12b44/loadPkg.R
## SHA-1 hash of file is 0bb1bb041c1dda15ee613db701d0c5784d1196a5

loadPkg("doParallel")

## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel

loadPkg("data.table")

## Loading required package: data.table

.timeStart <- Sys.time()

Set up the test data.

J <- 100  # This is the number of models to fit
N <- 1E5  # This is the size of the dataset
i <- rep(1:N, each=J)
D <- data.table(i,  # id
                j = rep(1:J, N),  # index repitition
                x1 = rep(rbinom(N, 1, 0.5), each=J),  # group membership, adult/child
                x2 = rnorm(N * J),  # fake risk factor
                x3 = rnorm(N * J),  # fake risk factor
                x4 = rbinom(N * J, 1, 0.5),  # fake risk factor
                x5 = rbinom(N * J, 1, 0.5))  # fake risk factor
D <- D[, logitp := -5 + x1 + x2 + x3 + x4 + x5]
D <- D[, p := exp(logitp) / (1 + exp(logitp))]
D <- D[, y := rbinom(N * J, 1, p)]
D <- rbind(D[, k := j], 
           D[x1 == 0, ][, k := as.integer(J * (x1 + 1) + j)], 
           D[x1 == 1, ][, k := as.integer(J * (x1 + 1) + j)])
setkey(D, k, i, j)
rm(i)

Define the wrapper functions.

model <- function (D) {
  require(data.table)
  fx <- formula(y ~ x1 + x2 + x3 + x4 + x5)
  M <- glm(fx, data=D, family=binomial)
  fitted(M)
}
summarize <- function (t, p) {
  data.frame(workers=workers,
             user = t[1],
             sys = t[2],
             elapsed = t[3],
             meanPhat = mean(unlist(p)),
             row.names = NULL)
}

Run with various number of workers.

workers <- 12
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n12 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

workers <- 8
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n8 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

workers <- 6
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n6 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

workers <- 5
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n5 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

workers <- 4
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n4 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

workers <- 3
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n3 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    556954   29.8     940480   50.3    940480   50.3
## Vcells 150808545 1150.6  349102527 2663.5 321046041 2449.4

workers <- 2
cl <- makeCluster(workers)
registerDoParallel(cl, cores=workers)
ptime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %dopar% {
    model(D[k == K, ])
  }
)
stopCluster(cl)
n2 <- summarize(ptime, phat)
rm(ptime, phat)
gc()

##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    556900   29.8     940480   50.3    940480   50.3
## Vcells 150808600 1150.6  349102527 2663.5 321046041 2449.4

workers <- 1
stime <- system.time(
  phat <- foreach(K = 1:(J * 3)) %do% {
    model(D[k == K, ])
  }
)
n1 <- summarize(stime, phat)
rm(stime, phat)
gc()

##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    567333   30.3    1168576   62.5    940480   50.3
## Vcells 150828353 1150.8  349102527 2663.5 349101411 2663.5

Summarize.

data.table(models=J,
           size=N,
           # rbind(n12, n8, n6, n5, n4, n3, n2, n1))
           rbind(n3, n2, n1))

##    models  size workers   user   sys elapsed meanPhat
## 1:    100 1e+05       3  10.75  5.12  229.73 0.074669
## 2:    100 1e+05       2   9.11  3.82  201.67 0.074669
## 3:    100 1e+05       1 595.55 86.47  197.01 0.074669

Session info

## Sourcing https://gist.githubusercontent.com/benjamin-chan/80149dd4cdb16b2760ec/raw/a1fafde5c5086024dd01d410cc2f72fb82e93f26/sessionInfo.R
## SHA-1 hash of file is 41209357693515acefb05d4b209340e744a1cbe4

benjamin-chan/parallelGLMTuning.md

Parallel GLM tuning

benjamin-chan commented Nov 19, 2015

Uh oh!