Last active
January 26, 2021 17:54
-
-
Save dantonnoriega/87d41db62fc9637abd1447eaedd7613c to your computer and use it in GitHub Desktop.
Example of setting up a mixed remote/local cluster with the `future` package. Includes simple examples of how to nest plans so that every core on every node is used. Note that this example does not demonstrate it, but it is important to run the same R version AND the same package versions on all nodes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# set up ---------------------------
library(furrr)
library(future) # provides `%<-%`, used by the per-node example further down
# use multiple clusters
# SSH login name used for every remote host.
ssh_username <- "drn"
# Remote servers, referenced by their `Host` aliases in ~/.ssh/config
# (e.g. `Host a`).
remote_ssh_configs <- c("a", "b", "e")
# Node name of the machine running this script; used as the local worker.
local_comp <- Sys.info()[["nodename"]]
# build cluster ----------------------------------------------------------------
# Kill leftover ssh processes (e.g. from an earlier run of this script) before
# opening new connections.
# NOTE(review): this sends `kill` to EVERY process whose `ps` line matches
# `ssh`, including unrelated interactive sessions and tunnels -- narrow the
# pattern if that is not acceptable.
system(command = "ps -axc | grep ssh | awk '{print $1}' | sort -u | xargs kill")

# One PSOCK worker per remote host, launched over SSH as `ssh_username`.
remote_cl <- future::makeClusterPSOCK(
  workers = remote_ssh_configs,
  user = ssh_username,
  rscript = "/usr/bin/Rscript", # use whatever is linked to Rscript on server
  rscript_args = "--vanilla",
  # FALSE actually launches the workers. Set to TRUE if you don't want anything
  # to run remotely.
  dryrun = FALSE
)
# A single PSOCK worker on the machine running this script, so the local
# computer takes part in the cluster alongside the remote hosts.
local_cl <- future::makeClusterPSOCK(
  workers = local_comp,
  user = "danton",
  rscript = "/usr/local/bin/Rscript", # use whatever is linked to Rscript locally
  rscript_args = "--vanilla",
  # FALSE actually launches the worker. Set to TRUE if you don't want it to
  # start.
  dryrun = FALSE
)
## setup future plan
## Two-level (nested) plan:
##   1. `clust` -- top-level futures are spread across the cluster workers
##      (remote hosts + local machine).
##   2. `cores` -- futures created *inside* a top-level future use multisession.
## `future::multisession` is more dependable cross-platform, or when one
## encounters errors due to BLAS optimization drivers, but it can be slower.
clust <- future::tweak(future::cluster, workers = c(remote_cl, local_cl))
## The function itself (not its result) is passed as `workers`.
## NOTE(review): presumably so the core count is looked up where the inner plan
## is set up (on each node) rather than on this machine -- confirm against the
## `future::multisession` documentation.
cores <- future::tweak(future::multisession, workers = future::availableCores)
future::plan(list(clust, cores))
# examples -------------------------------------------------------------------
## Uses every cluster node but does not parallelize within a node: a single
## (non-nested) future_map never gets past the first layer of the plan, so it
## only hits `clust`.
a <- furrr::future_map(
  1:1e4,
  ~ list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
)
## Number of elements handled per (node, pid). Written as nested, namespaced
## calls so it does not depend on a pipe operator being attached.
dplyr::count(dplyr::bind_rows(a), node, pid)
## Uses every cluster node AND parallelizes within each node: the nested
## future_map is what reaches the second layer of the plan (`cores`).
b <- furrr::future_map(.x = 1:6, ~ { # (1) hits first plan, `clust` (future::cluster)
  # Keep the outer index under its own name; `.x` is rebound inside the inner
  # lambda.
  y <- .x
  furrr::future_map(1:1e4, ~ { # (2) hits second plan, `cores` (future::multisession)
    list(node = Sys.info()[["nodename"]], run = y, pid = Sys.getpid())
  })
})
## One summary per outer run, stacked into a single table of
## (node, run, pid) counts.
dplyr::bind_rows(
  purrr::map(b, ~ dplyr::count(dplyr::bind_rows(.x), node, run, pid))
)
## Can assign to one node at a time, sequentially, allowing different functions
## to be assigned to different nodes.
## Each `%<-%` (from the `future` package) creates one top-level future.
## NOTE(review): the per-node comments below assume futures are handed to the
## cluster workers in the order the workers were listed -- confirm.
nodes <- c(remote_ssh_configs, local_comp) # was printed without being defined
print(nodes)
n1 %<-% { ## assigned to node 'a' (i.e. remote_ssh_configs[1])
  furrr::future_map(1:1e4, ~ {
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n2 %<-% { ## assigned to node 'b' (i.e. remote_ssh_configs[2])
  furrr::future_map(1:1e4, ~ {
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n3 %<-% { ## assigned to node 'e' (i.e. remote_ssh_configs[3])
  furrr::future_map(1:1e4, ~ {
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n4 %<-% { ## assigned to local node (i.e. local_comp)
  furrr::future_map(1:1e4, ~ {
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
## Reading n1..n4 here collects the four per-node results; count the elements
## per (node, pid) and stack the four summaries into one table.
dplyr::bind_rows(
  purrr::map(
    list(n1, n2, n3, n4),
    ~ dplyr::count(dplyr::bind_rows(.x), node, pid)
  )
)

# clean up ---------------------------------------------------------------------
## Reset the plan first so nothing still points at the clusters, then shut the
## workers down so no R processes / SSH connections are left behind.
future::plan(future::sequential)
parallel::stopCluster(remote_cl)
parallel::stopCluster(local_cl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment