Created
June 25, 2019 15:38
-
-
Save genomewalker/1b681d85eedca415993684e1d1cea2c2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the node list that SLURM exported for this job allocation.
node_list = Sys.getenv("SLURM_NODELIST")
cat("SLURM nodes:", node_list, "\n")

# Fail early with a clear message: the rest of the script needs `ips` and
# `network`, which can only be derived from a non-empty node list.
if (node_list == "") {
  stop("SLURM_NODELIST is empty; run this script inside a SLURM allocation.")
}

# SLURM may hand out a compressed list (e.g. "node[01-03,07]"). A plain comma
# split would tear such an entry apart, so let scontrol expand it; a simple
# comma-separated list is still split directly.
if (grepl("[", node_list, fixed = TRUE)) {
  nodes = system2("scontrol",
                  args = c("show", "hostnames", shQuote(node_list)),
                  stdout = TRUE)
} else {
  nodes = strsplit(node_list, ",")[[1]]
}

# Look up IPs of the allocated nodes.
ips = rep(NA_character_, length(nodes))
for (i in seq_along(nodes)) {
  # The pipe into awk relies on system2() passing the joined command line to a
  # shell (true on Unix-alikes); awk prints the 4th field of the first
  # "has address" line.
  host_args = c(nodes[i], " | awk '/has address/ { print $4 ; exit }'")
  result = system2("host", args = host_args, stdout = TRUE)
  if (length(result) == 0) {
    stop("Could not resolve an IP address for node '", nodes[i], "'.")
  }
  # Extract the IP from the result output (first whitespace-delimited token).
  ips[i] = sub("^([^ ]+) +.*$", "\\1", result[1], perl = TRUE)
}
cat("SLURM IPs:", paste(ips, collapse=", "), "\n")

# Combine into a network string for h2o: one /32 entry per node, comma-joined.
network = paste0(paste0(ips, "/32"), collapse=",")
cat("Network:", network, "\n")
# `ips` and `network` are produced by the SLURM node lookup; without them no
# worker can be launched, so stop with an explicit message instead of an
# "object not found" error further down.
if (!exists("ips") || !exists("network") || length(ips) == 0) {
  stop("No node IPs available; the SLURM node lookup must succeed first.")
}

# Specify how many nodes we want h2o to use.
h2o_num_nodes = length(ips)

# Options to pass to java call:
args = c(
  # -Xmx100g allocates 100GB of RAM per node. Needs to come before "-jar".
  "-Xmx100g",
  # Specify path to downloaded h2o jar.
  "-jar /vol/cloud/osd2014/h2o-3.24.0.5/h2o.jar",
  # Specify a cloud name for the cluster.
  "-name h2o_r",
  # Port passed to h2o on every node.
  "-port 55599",
  # Specify IPs of other nodes.
  paste("-network", network)
)
cat(paste0("Args:\n", paste(args, collapse="\n"), "\n"))
# Run once for each node we want to start. seq_len() yields an empty loop when
# h2o_num_nodes is 0, whereas 1:0 would iterate over 1 and 0.
for (node_i in seq_len(h2o_num_nodes)) {
  cat("\nLaunching h2o worker on", ips[node_i], "\n")
  new_args = c(ips[node_i], "java", args)
  # Ssh into the target IP and launch an h2o worker with its own
  # output and error files. These could go in a subdirectory.
  cmd_result = system2("ssh", args = new_args,
                       stdout = paste0("h2o_out_", node_i, ".txt"),
                       stderr = paste0("h2o_err_", node_i, ".txt"),
                       # Need to specify wait=FALSE so that it runs in the background.
                       wait = FALSE)
  # This should be 0. NOTE(review): because the command runs in the background,
  # this presumably only reflects that it was launched, not that h2o started;
  # check the h2o_err_*.txt files for actual failures.
  cat("Cmd result:", cmd_result, "\n")
  # Wait one second between inits.
  Sys.sleep(1L)
}

# Wait 3 more seconds to find all the nodes, otherwise we may only
# find the node on localhost.
Sys.sleep(3L)

# Check if h2o is running. We will see ssh processes and one java process.
# The pipe into grep relies on system2() passing the command line to a shell.
system2("ps", c("-ef", "| grep h2o.jar"), stdout = TRUE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment