---
title: "Using Spark, Delta Lake and MLflow"
output: html_notebook
---
First, we define a helper function that retrieves the latest version of a Delta Lake table from its commit history:

```{r}
delta_version <- function(sc, path) {
  # Load the Delta table at `path`, fetch its commit history as a
  # Spark DataFrame, and keep only the most recent version number.
  invoke_static(sc, "io.delta.tables.DeltaTable", "forPath", spark_session(sc), path) %>%
    invoke("history") %>%
    sdf_register() %>%
    dplyr::arrange(dplyr::desc(version)) %>%
    head(1) %>%
    dplyr::pull(version)
}
```
Then we connect to Spark with the Delta Lake package loaded:

```{r}
library(sparklyr)
sc <- spark_connect(master = "local", version = "2.4", packages = "delta")
```
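If the `delta` shorthand is not recognized by your sparklyr version, the same connection can be made by passing the Delta Lake Maven coordinates explicitly. The coordinates below are an assumption matching Spark 2.4 with Scala 2.11 and may need adjusting for your cluster:

```{r}
# A hedged alternative, assuming Delta Lake 0.4.0 built for Scala 2.11
# (the build line that matches Spark 2.4); adjust as needed.
sc <- spark_connect(
  master = "local",
  version = "2.4",
  packages = "io.delta:delta-core_2.11:0.4.0"
)
```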
Next, we cache the wine-quality dataset locally with pins, read it into Spark, and store it as a Delta Lake table:

```{r}
wine_file <- pins::pin("https://raw.githubusercontent.com/rstudio/mlflow-example/master/wine-quality.csv")
train <- spark_read_csv(sc, wine_file)
spark_write_delta(train, "test", mode = "overwrite")
```
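As a quick sanity check, we can read the table back and query its version with the helper defined above. A freshly created table reports version 0; each subsequent overwrite adds a commit that increments the version:

```{r}
# Read the Delta table back and confirm its current version.
# A new table starts at version 0; each overwrite adds a commit.
spark_read_delta(sc, "test")
delta_version(sc, "test")
```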
Finally, we start an MLflow run that records the Delta Lake table version alongside the model parameters and metrics:

```{r}
library(mlflow)

with(mlflow_start_run(), {
  # set experiment tags
  mlflow_set_tag("project", "wines")

  # log the version of the training data
  mlflow_log_param("delta", delta_version(sc, "test"))

  # log model parameters
  lambda <- mlflow_log_param("lambda", 0.5) %>% as.numeric()

  # build the model and evaluate it against the training data
  model <- ml_generalized_linear_regression(train, quality ~ ., reg_param = lambda)
  metrics <- ml_evaluate(model, train)

  # log metrics
  mlflow_log_metric("aic", metrics$aic())
  mlflow_log_metric("deviance", metrics$deviance())
  mlflow_log_metric("dispersion", metrics$dispersion())
})
```
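Because each run logs the Delta table version, the exact training data can later be reconstructed with Delta Lake time travel. A minimal sketch, assuming Delta Lake's `versionAsOf` reader option and a previously logged version of 0:

```{r}
# Hypothetical illustration: re-read the table exactly as it was at the
# version logged in the MLflow run (here assumed to be 0), using Delta
# Lake's "versionAsOf" reader option.
train_v0 <- spark_read_delta(sc, "test", options = list(versionAsOf = 0))
```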
We can then explore the logged runs in the MLflow user interface:

```{r}
mlflow_ui()
```
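Runs can also be retrieved programmatically rather than through the UI. A sketch using `mlflow_search_runs()` with an MLflow filter expression over the tag we set above:

```{r}
# Retrieve all runs tagged with our project name as a data frame.
runs <- mlflow_search_runs(filter = "tags.project = 'wines'")
runs
```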
Once we are done, we disconnect from Spark:

```{r}
spark_disconnect(sc)
```