Skip to content

Instantly share code, notes, and snippets.

@grosscol
Created March 23, 2018 20:01
Show Gist options
  • Save grosscol/1fd690ddd9377c8a72ae333313c3061e to your computer and use it in GitHub Desktop.
Save grosscol/1fd690ddd9377c8a72ae333313c3061e to your computer and use it in GitHub Desktop.
Do parallel for A2MADS
library(tidyverse)
library(broom)
library(doParallel)
# set variables to define data
n_samples <- 1000000
n_vars <- 10
n_group1 <- 500
n_group2 <- 2
# set number of cores
cores <- 8
# create mock data
df <- data.frame(
replicate(n_vars,rnorm(n_samples, 0, 1)),
label=rpois(n_samples, 10)
)
df <- df %>%
mutate(
group1=rep(
1:n_group1,
each=ceiling(n_samples/n_group1),
length.out=n_samples
),
group2=rep(
1:n_group2,
times=ceiling(n_samples/n_group2),
length.out=n_samples
)
)
# create table of groupings of mock data to use in foreach
groupings <- df %>%
select(group1, group2) %>%
distinct()
# create nested dataframe for "chunked" version
df_split <- df %>%
group_by(group1, group2) %>%
nest() %>%
mutate(chunk=(row_number() %% cores)+1)
# apply this function to groupings
lmCoeffs <- function(g1, g2, x){
fit <- lm(label ~ ., data=x)
stats <- tidy(fit)
output <- data.frame(
group1=rep(g1, nrow(stats)),
group2=rep(g2, nrow(stats)),
stats
)
Sys.sleep(0.5) # add a little bit of time to more accruately reflect true runtime
return(output)
}
# serial processing with purrr::pmap
system.time({
results1 <- df_split %>%
mutate(results=pmap(list(group1, group2, data), lmCoeffs)) %>%
select(-group1, -group2, -chunk) %>%
unnest(results)
})
# parallel processing with foreach %dopar% for each grouping
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
system.time({
results2 <- foreach(
i=1:nrow(groupings),
.packages=c("dplyr","broom"), # package dependencies for %dopar%
.combine=function(...) bind_rows(list(...)),
.multicombine = T
) %dopar% {
g1 <- groupings$group1[i]
g2 <- groupings$group2[i]
set <- df %>% filter(group1==g1, group2==g2)
lmCoeffs(g1, g2, set)
}
})
stopCluster(cl)
# parallel chunk processing with embedded purrr::map
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
system.time({
results3 <- foreach(
i=1:cores,
.packages=c("tidyr","dplyr","broom","purrr"),
.combine=function(...) bind_rows(list(...)),
.multicombine = T
) %dopar% {
df_split %>%
filter(chunk==i) %>%
mutate(results=pmap(list(group1, group2, data), lmCoeffs)) %>%
select(results) %>%
unnest(results)
}
})
stopCluster(cl)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment