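# Simulation: compare a tuned elastic net (glmnet) against ordinary least
# squares on data where the true predictors are only observed through noisy,
# highly correlated proxies. Each run simulates a dataset, tunes and fits both
# models on an 80% training split, and records the training/test RMSE
# difference (glmnet - lm). Requires the glmnet package to be installed.
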
# set.seed(123) # uncomment to make the full simulation reproducible
library(tidymodels)

fit_models <- function(n = 100) {
  # Create 10 predictors, some of which are noisy or highly correlated
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rnorm(n)
  x4 <- rnorm(n)
  x5 <- x1 + rnorm(n, sd = 0.75) # noisy version of x1
  x6 <- x2 + rnorm(n, sd = 0.75) # noisy version of x2
  x7 <- 3 * x1 + 2 * x2 + rnorm(n, sd = 0.1) # highly correlated with x1 and x2
  x8 <- 3 * x3 + 2 * x4 + rnorm(n, sd = 0.1) # highly correlated with x3 and x4
  x9 <- rnorm(n)
  x10 <- rnorm(n)
  # The outcome depends on x1 and x2, but the data frame below deliberately
  # drops them, keeping only their noisy/correlated proxies (x5-x8)
  outcome <- 2 + 3 * x1 - 1.5 * x2 + 0.5 * x3 + x7 + x8 + x9 + x10 + rnorm(n)
  data <- tibble::tibble(outcome, x3, x4, x5, x6, x7, x8, x9, x10)

  data_split <- initial_split(data, prop = 0.8)
  training_data <- training(data_split)
  test_data <- testing(data_split)

  glmnet_recipe <-
    recipe(formula = outcome ~ x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10, data = training_data) %>%
    step_zv(all_predictors()) %>%
    step_normalize(all_numeric_predictors())

  glmnet_spec <-
    linear_reg(penalty = tune(), mixture = tune()) %>%
    set_mode("regression") %>%
    set_engine("glmnet")

  glmnet_workflow <-
    workflow() %>%
    add_recipe(glmnet_recipe) %>%
    add_model(glmnet_spec)

  glmnet_grid <- tidyr::crossing(
    penalty = 10^seq(-6, -1, length.out = 20),
    mixture = c(0.05, 0.2, 0.4, 0.6, 0.8, 1)
  )

  folds <- vfold_cv(training_data, v = 5)

  glmnet_tune <-
    tune_grid(glmnet_workflow, resamples = folds, grid = glmnet_grid)

  # show_best(glmnet_tune, metric = "rmse") # inspect interactively; unused inside the function

  glmnet_model <- finalize_workflow(
    glmnet_workflow,
    select_best(glmnet_tune, metric = "rmse")
  ) |>
    fit(data = training_data)

  lm_spec <-
    linear_reg() %>%
    set_mode("regression")

  lm_workflow <-
    workflow() %>%
    add_recipe(glmnet_recipe) %>%
    add_model(lm_spec)

  # fit_resamples(lm_workflow, folds) |> collect_metrics() # resampled lm metrics, if of interest
  lm_fit <- fit(lm_workflow, data = training_data)

  # training data
  training_lm <- augment(lm_fit, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "training")

  training_glmnet <- augment(glmnet_model, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "training")

  # test data
  test_lm <- augment(lm_fit, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "test")

  test_glmnet <- augment(glmnet_model, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "test")

  # one row per data split: RMSE for each model, plus the difference
  bind_rows(training_lm, training_glmnet, test_lm, test_glmnet) |>
    select(model, data, rmse = .estimate) |>
    pivot_wider(names_from = model, values_from = rmse) |>
    mutate(diff = glmnet - lm)
}

# Run the simulation 1000 times (slow: each run tunes glmnet over a
# 120-combination grid with 5-fold cross-validation)
sim <- map_dfr(1:1000, ~ fit_models(), .id = "id")
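
# A quick numeric summary to pair with the densities below: the average RMSE
# difference and how often glmnet beats lm (diff < 0), by data split.
# A minimal sketch; group_by()/summarize() come from dplyr (attached by tidymodels).
sim |>
  group_by(data) |>
  summarize(
    mean_diff = mean(diff),
    prop_glmnet_better = mean(diff < 0)
  )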

# Density of the RMSE differences, faceted by training vs. test data
sim |>
  ggplot(aes(x = diff, fill = data)) +
  facet_wrap(~ data) +
  geom_density(color = NA)
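
# Optional: a dashed reference line at zero makes it easier to see where
# glmnet has lower RMSE than lm (diff < 0). A sketch; last_plot() and
# geom_vline() are from ggplot2.
last_plot() +
  geom_vline(xintercept = 0, linetype = "dashed")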