This script requires an Amazon EMR cluster with one master node and three worker nodes:
library(sparklyr)
# Open a Spark connection through YARN, using the Spark installation found at
# /usr/lib/spark/. The config list pins the executor layout instead of letting
# Spark scale it dynamically.
sc <- spark_connect(
  master = "yarn",
  spark_home = "/usr/lib/spark/",
  config = list(
    # Dynamic allocation is switched off so the executor count below is fixed.
    spark.dynamicAllocation.enabled = FALSE,
    # `sparklyr.shell.*` entries: 8 cores per executor, 3 executors.
    "sparklyr.shell.executor-cores" = 8,
    "sparklyr.shell.num-executors" = 3,
    # NOTE(review): presumably exported as the WORKON_HOME environment
    # variable to spark_apply() workers so Python virtualenvs are created
    # under /tmp -- confirm against the sparklyr configuration docs.
    sparklyr.apply.env.WORKON_HOME = "/tmp/.virtualenvs"
  )
)
# Build a 3-row Spark DataFrame split into 3 partitions and run the closure
# below on each partition with barrier = TRUE, then collect the per-partition
# results back into the local R session.
sdf_len(sc, 3, repartition = 3) %>%
  spark_apply(
    function(df, barrier) {
      # Any error raised inside the body is turned into its message text, so
      # the failure shows up as the partition's result instead of aborting.
      tryCatch(
        {
          library(tensorflow)
          install_tensorflow()

          # Sort the addresses so every partition derives the same ordering.
          addresses <- sort(as.character(barrier$address))

          # First address acts as the parameter server; the rest are workers.
          cluster_spec <- tf$train$ClusterSpec(list(
            ps = list(addresses[1]),
            worker = as.list(addresses[-1])
          ))

          # NOTE(review): every partition starts its server with
          # job_name = "ps", regardless of which address it owns -- confirm
          # this is intended for the demo.
          tf_server <- tf$distribute$Server(cluster_spec, job_name = "ps")

          as.character(tf$constant("Hello World"))
        },
        error = function(err) err$message
      )
    },
    barrier = TRUE,
    # The result is declared as a single character column named `address`.
    columns = c(address = "character")
  ) %>%
  collect()
References:
- https://github.com/tensorflow/examples/blob/master/community/en/docs/deploy/distributed.md
- https://stackoverflow.com/questions/39666845/how-does-tf-train-replica-device-setter-work
- https://stackoverflow.com/questions/41166681/what-does-global-step-mean-in-tensorflow
- https://stackoverflow.com/questions/33919948/how-to-set-adaptive-learning-rate-for-gradientdescentoptimizer