Created
March 23, 2018 20:01
-
-
Save grosscol/1fd690ddd9377c8a72ae333313c3061e to your computer and use it in GitHub Desktop.
Do parallel for A2MADS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(broom) | |
library(doParallel) | |
# set variables to define data | |
n_samples <- 1000000 | |
n_vars <- 10 | |
n_group1 <- 500 | |
n_group2 <- 2 | |
# set number of cores | |
cores <- 8 | |
# create mock data | |
df <- data.frame( | |
replicate(n_vars,rnorm(n_samples, 0, 1)), | |
label=rpois(n_samples, 10) | |
) | |
df <- df %>% | |
mutate( | |
group1=rep( | |
1:n_group1, | |
each=ceiling(n_samples/n_group1), | |
length.out=n_samples | |
), | |
group2=rep( | |
1:n_group2, | |
times=ceiling(n_samples/n_group2), | |
length.out=n_samples | |
) | |
) | |
# create table of groupings of mock data to use in foreach | |
groupings <- df %>% | |
select(group1, group2) %>% | |
distinct() | |
# create nested dataframe for "chunked" version | |
df_split <- df %>% | |
group_by(group1, group2) %>% | |
nest() %>% | |
mutate(chunk=(row_number() %% cores)+1) | |
# apply this function to groupings | |
lmCoeffs <- function(g1, g2, x){ | |
fit <- lm(label ~ ., data=x) | |
stats <- tidy(fit) | |
output <- data.frame( | |
group1=rep(g1, nrow(stats)), | |
group2=rep(g2, nrow(stats)), | |
stats | |
) | |
Sys.sleep(0.5) # add a little bit of time to more accruately reflect true runtime | |
return(output) | |
} | |
# serial processing with purrr::pmap | |
system.time({ | |
results1 <- df_split %>% | |
mutate(results=pmap(list(group1, group2, data), lmCoeffs)) %>% | |
select(-group1, -group2, -chunk) %>% | |
unnest(results) | |
}) | |
# parallel processing with foreach %dopar% for each grouping | |
cl <- makeCluster(cores) | |
registerDoParallel(cl, cores=cores) | |
system.time({ | |
results2 <- foreach( | |
i=1:nrow(groupings), | |
.packages=c("dplyr","broom"), # package dependencies for %dopar% | |
.combine=function(...) bind_rows(list(...)), | |
.multicombine = T | |
) %dopar% { | |
g1 <- groupings$group1[i] | |
g2 <- groupings$group2[i] | |
set <- df %>% filter(group1==g1, group2==g2) | |
lmCoeffs(g1, g2, set) | |
} | |
}) | |
stopCluster(cl) | |
# parallel chunk processing with embedded purrr::map | |
cl <- makeCluster(cores) | |
registerDoParallel(cl, cores=cores) | |
system.time({ | |
results3 <- foreach( | |
i=1:cores, | |
.packages=c("tidyr","dplyr","broom","purrr"), | |
.combine=function(...) bind_rows(list(...)), | |
.multicombine = T | |
) %dopar% { | |
df_split %>% | |
filter(chunk==i) %>% | |
mutate(results=pmap(list(group1, group2, data), lmCoeffs)) %>% | |
select(results) %>% | |
unnest(results) | |
} | |
}) | |
stopCluster(cl) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment