library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom 0.5.5 ✓ recipes 0.1.9
#> ✓ dials 0.0.4 ✓ rsample 0.0.5
#> ✓ dplyr 0.8.5 ✓ tibble 2.1.3
#> ✓ ggplot2 3.3.0 ✓ tune 0.0.1
#> ✓ infer 0.5.1 ✓ workflows 0.1.1
#> ✓ parsnip 0.0.5 ✓ yardstick 0.0.6
#> ✓ purrr 0.3.3
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x ggplot2::margin() masks dials::margin()
#> x recipes::step() masks stats::step()
#> x recipes::yj_trans() masks scales::yj_trans()
# Preprocessing: model mpg as a function of every other column in mtcars.
rec <- recipe(mpg ~ ., data = mtcars)

# Model: a random forest fitted with the "ranger" engine.
mdl <- rand_forest()
mdl <- set_engine(mdl, "ranger")

# Bundle the recipe and the model specification into one workflow.
wflow <- workflow()
wflow <- add_recipe(wflow, rec)
wflow <- add_model(wflow, mdl)

# Fit the same workflow twice on the same data, resetting the RNG seed to the
# same value before each fit so both fits start from the same random state.
set.seed(123)
model1 <- fit(wflow, mtcars)
set.seed(123)
model2 <- fit(wflow, mtcars)
## Iterate over a list and hash each list item
#
# x:    a list (or list-like object such as a fitted workflow); every element
#       is passed to digest::digest() on its own.
# algo: hashing algorithm forwarded to digest::digest(). "crc32" is only the
#       default because it produces a short hash.
#
# Returns a character vector with one hash per element of `x`, named like `x`.
# vapply() is used instead of sapply() so the result is always a character
# vector, including for empty input (where sapply() returns an empty list).
list_hash <- function(x, algo = "crc32") {
  vapply(x, digest::digest, FUN.VALUE = character(1), algo = algo)
}
Comparing the different hashes, we see that only `model$fit` changes:
# Hash every top-level slot of both fitted workflows and show them side by side.
lapply(list(model1 = model1, model2 = model2), list_hash)
#> $model1
#> pre fit post trained
#> "70872185" "ad3ca3be" "25ebfb78" "a49c55d7"
#>
#> $model2
#> pre fit post trained
#> "70872185" "f30d6fdf" "25ebfb78" "a49c55d7"
Comparing the different hashes, we see that only `model$fit$fit` changes:
# Descend one level: hash every slot inside `$fit` of both fitted workflows.
lapply(list(model1 = model1$fit, model2 = model2$fit), list_hash)
#> $model1
#> actions fit
#> "25b54439" "13ce6ea7"
#>
#> $model2
#> actions fit
#> "25b54439" "11cc2d41"
Comparing the different hashes, we see that only `model$fit$fit$elapsed` changes:
# Descend one more level: hash every slot inside `$fit$fit` of both workflows.
lapply(list(model1 = model1$fit$fit, model2 = model2$fit$fit), list_hash)
#> $model1
#> lvl spec fit preproc elapsed
#> "7b410007" "627da451" "e9524435" "9303470a" "060e0968"
#>
#> $model2
#> lvl spec fit preproc elapsed
#> "7b410007" "627da451" "e9524435" "9303470a" "c30f17f1"
## Hash a fitted workflow while ignoring how long the fit took.
#
# x:    a fitted workflow; its `pre`, `fit`, `post` and `trained` slots are
#       hashed together.
# algo: hashing algorithm handed to digest::digest(). "crc32" is only the
#       default because it produces a short hash.
#
# Returns a single hash string.
hash_model <- function(x, algo = "crc32") {
  # Blank the recorded timing on the local copy so it cannot affect the hash.
  x$fit$fit$elapsed <- NA

  slots <- list(x$pre, x$fit, x$post, x$trained)
  digest::digest(slots, algo = algo)
}
# Now the hashes are identical: hash_model() sets the elapsed time to NA before
# hashing, and both fitted workflows print the same value below.
hash_model(model1)
#> [1] "0c35cd41"
hash_model(model2)
#> [1] "0c35cd41"
Created on 2020-03-19 by the reprex package (v0.3.0)
When we save and load a workflow, the `add_step()` calls (saved as quosures) receive a new environment internally, so the hash of the model changes although the model itself does not.
An updated version of the `hash_model()` function would look like this:
## Hash a fitted workflow, ignoring the elapsed fit time and replacing recipe
## steps by labels before hashing.
#
# x:    a fitted workflow.
# algo: hashing algorithm handed to digest::digest(). "crc32" is the default.
#
# Returns a single hash string computed over the whole (modified) object.
#
# NOTE(review): as_label() is called unqualified, so it has to be provided by
# an attached package -- confirm which one supplies it in your session.
hash_model <- function(x, algo = "crc32") {
  # Replace every element of a step list by the label as_label() gives it.
  steps_to_labels <- function(steps) lapply(steps, as_label)

  # Blank the recorded timing on the local copy so it cannot affect the hash.
  x$fit$fit$elapsed <- NA

  # Pre-processing side: the recipe stored in the actions and in the mold
  # blueprint. The assignment order below is kept as-is on purpose.
  x$pre$actions$recipe$recipe$steps <-
    steps_to_labels(x$pre$actions$recipe$recipe$steps)
  x$pre$mold$blueprint$recipe$steps <-
    steps_to_labels(x$pre$mold$blueprint$recipe$steps)

  # Post-processing side.
  # NOTE(review): the original author was unsure whether these are the right
  # paths for post steps -- verify against the workflow structure.
  x$post$actions$recipe$recipe$steps <-
    steps_to_labels(x$post$actions$recipe$recipe$steps)
  x$post$actions$blueprint$recipe$steps <-
    steps_to_labels(x$post$actions$blueprint$recipe$steps)

  digest::digest(x, algo = algo)
}
# Round-trip a fitted workflow through saveRDS()/readRDS() so its hash can be
# compared before and after serialisation. `model1` is used because no object
# called `model` is defined in this script. The reloaded copy is stored in
# `model2`, overwriting the second fit.
tmp <- tempfile()
saveRDS(model1, tmp)
model2 <- readRDS(tmp)
hash_model(model1)
hash_model(model2)
As a comment to the addendum, here is a script that looks at all differing slots:
Created on 2020-04-22 by the reprex package (v0.3.0)