Example of setting up a mixed remote / local cluster using the `future` package. Includes simple examples of how to structure a nested plan to maximize core usage. Note: all nodes should run the same R version AND the same package versions; this example does not cover keeping them in sync.
# set up ---------------------------
library(furrr)
library(dplyr) # needed for %>%, bind_rows(), and count() in the examples below
# use multiple clusters
ssh_username <- 'drn'
remote_ssh_configs <- c('a', 'b', 'e') # names for the remote servers (found in ~/.ssh/config, e.g. Host a)
local_comp <- Sys.info()[["nodename"]] # get local computer name
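## a sketch of what a matching ~/.ssh/config entry might look like for one of the remote
## hosts (host name and values below are hypothetical, not from the original gist):
## Host a
##   HostName a.example.com
##   User drn
##   IdentityFile ~/.ssh/id_rsa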
# build cluster ----------------------------------------------------------------
# kill any lingering ssh sessions (e.g. workers left over from a previous cluster)
system(command = "ps -axc | grep ssh | awk '{print $1}' | sort -u | xargs kill")
remote_cl <- future::makeClusterPSOCK(
  workers = remote_ssh_configs,
  user = ssh_username,
  rscript = c("/usr/bin/Rscript"), # use whatever is linked to Rscript on the server
  rscript_args = '--vanilla',
  # dryrun = FALSE actually builds the cluster; set to TRUE to only preview the setup commands
  dryrun = FALSE
)
local_cl <- future::makeClusterPSOCK(
  workers = local_comp,
  user = 'danton',
  rscript = c("/usr/local/bin/Rscript"), # use whatever is linked to Rscript locally
  rscript_args = '--vanilla',
  # dryrun = FALSE actually builds the cluster; set to TRUE to only preview the setup commands
  dryrun = FALSE
)
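## the description notes that every node should run the same R version and package versions;
## a minimal sanity check (a sketch, assuming the clusters above built successfully) is to
## query each node directly -- makeClusterPSOCK() returns clusters compatible with `parallel`
parallel::clusterCall(remote_cl, function() R.version.string)
parallel::clusterCall(local_cl, function() R.version.string)
parallel::clusterCall(remote_cl, function() as.character(utils::packageVersion("furrr")))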
## set up the nested future plan: the outer layer (`clust`) distributes work across nodes,
## the inner layer (`cores`) parallelizes across the cores of each node
## `future::multisession` is more dependable cross-platform, or when one encounters errors
## due to BLAS optimization drivers, but can be slower than `future::multicore`
clust <- future::tweak(future::cluster, workers = c(remote_cl, local_cl))
cores <- future::tweak(future::multisession, workers = future::availableCores) # pass the function (not availableCores()) so each node resolves its own core count
future::plan(list(clust, cores))
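## quick sanity check on the outer layer of the plan (a sketch; the exact count depends on
## your setup): nbrOfWorkers() should report one worker per node, i.e. 4 here (3 remote + 1 local)
future::nbrOfWorkers()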
# examples -------------------------------------------------------------------
## uses all cluster nodes but does not parallelize within a node, because it never gets past
## the first nested layer of the plan (it only hits `clust`)
a <- furrr::future_map(1:1e4, ~list(node = Sys.info()[["nodename"]], pid = Sys.getpid()))
a %>%
  dplyr::bind_rows() %>%
  dplyr::count(node, pid)
## uses all cluster nodes AND parallelizes within each node: the nested, inner future_map()
## executes against the second layer of the plan (the multisession plan, `cores`)
b <- furrr::future_map(.x = 1:6, ~{ # (1) hits the first layer of the plan, `clust` (future::cluster)
  y = .x
  furrr::future_map(1:1e4, ~{ # (2) hits the second layer of the plan, `cores` (future::multisession)
    list(node = Sys.info()[["nodename"]], run = y, pid = Sys.getpid())
  })
})
purrr::map(b, ~{
  .x %>%
    dplyr::bind_rows() %>%
    dplyr::count(node, run, pid)
}) %>%
  dplyr::bind_rows()
## futures can also be assigned one node at a time, sequentially, which allows different
## expressions to be sent to different nodes
print(c(remote_ssh_configs, local_comp)) # the nodes, in the order their workers were passed to the plan
library(future) # for the %<-% future-assignment operator
n1 %<-% { ## assigned to node 'a' (i.e. remote_ssh_configs[1])
  furrr::future_map(1:1e4, ~{
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n2 %<-% { ## assigned to node 'b' (i.e. remote_ssh_configs[2])
  furrr::future_map(1:1e4, ~{
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n3 %<-% { ## assigned to node 'e' (i.e. remote_ssh_configs[3])
  furrr::future_map(1:1e4, ~{
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
n4 %<-% { ## assigned to local node (i.e. local_comp)
  furrr::future_map(1:1e4, ~{
    list(node = Sys.info()[["nodename"]], pid = Sys.getpid())
  })
}
purrr::map(list(n1, n2, n3, n4), ~{
  .x %>%
    dplyr::bind_rows() %>%
    dplyr::count(node, pid)
}) %>%
  dplyr::bind_rows()
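## clean up (a suggested sketch, not part of the original gist): reset the plan and shut the
## workers down once finished
future::plan(future::sequential)
parallel::stopCluster(remote_cl)
parallel::stopCluster(local_cl)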