Created
September 10, 2023 21:21
-
-
Save malcolmbarrett/6156875e781206bfdfe7147414d5016f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#set.seed(123) # Set seed for reproducibility | |
library(tidymodels) | |
#' Compare regularized (glmnet) and ordinary (lm) regression on simulated data.
#'
#' Simulates `n` observations with a mix of independent, noisy, and highly
#' correlated predictors, tunes an elastic-net model via 5-fold CV on a
#' training split, fits a plain linear model on the same recipe, and returns
#' the training/test RMSE for both models along with their difference.
#'
#' @param n Number of observations to simulate (default 100).
#' @return A two-row tibble (one row per split) with columns
#'   `data`, `lm`, `glmnet`, and `diff` (glmnet RMSE minus lm RMSE).
fit_models <- function(n = 100) {
  # Predictors: x1-x4 independent, x5/x6 noisy copies of x1/x2,
  # x7/x8 near-collinear combinations, x9/x10 additional independent noise.
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rnorm(n)
  x4 <- rnorm(n)
  x5 <- x1 + rnorm(n, sd = 0.75)              # noisy version of x1
  x6 <- x2 + rnorm(n, sd = 0.75)              # noisy version of x2
  x7 <- 3 * x1 + 2 * x2 + rnorm(n, sd = 0.1)  # highly correlated with x1, x2
  x8 <- 3 * x3 + 2 * x4 + rnorm(n, sd = 0.1)  # highly correlated with x3, x4
  x9 <- rnorm(n)
  x10 <- rnorm(n)

  # Outcome depends on x1/x2 directly, but those columns are withheld from the
  # modeling data below, so the models must recover their signal via x5-x8.
  outcome <- 2 + 3 * x1 - 1.5 * x2 + 0.5 * x3 + x7 + x8 + x9 + x10 + rnorm(n)

  # NOTE: x1 and x2 are deliberately excluded from the modeling data.
  data <- tibble::tibble(outcome, x3, x4, x5, x6, x7, x8, x9, x10)

  data_split <- initial_split(data, prop = 0.8)
  training_data <- training(data_split)
  test_data <- testing(data_split)

  # Shared preprocessing: drop zero-variance columns, standardize predictors
  # (normalization matters for the penalized model).
  glmnet_recipe <-
    recipe(outcome ~ x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10,
           data = training_data) |>
    step_zv(all_predictors()) |>
    step_normalize(all_numeric_predictors())

  glmnet_spec <-
    linear_reg(penalty = tune(), mixture = tune()) |>
    set_mode("regression") |>
    set_engine("glmnet")

  glmnet_workflow <-
    workflow() |>
    add_recipe(glmnet_recipe) |>
    add_model(glmnet_spec)

  # Regular grid over penalty (log scale) and mixture (ridge -> lasso).
  glmnet_grid <- tidyr::crossing(
    penalty = 10^seq(-6, -1, length.out = 20),
    mixture = c(0.05, 0.2, 0.4, 0.6, 0.8, 1)
  )

  folds <- vfold_cv(training_data, v = 5)

  glmnet_tune <-
    tune_grid(glmnet_workflow, resamples = folds, grid = glmnet_grid)

  # Refit the best (by CV RMSE) glmnet configuration on the full training set.
  glmnet_model <- finalize_workflow(
    glmnet_workflow,
    select_best(glmnet_tune, metric = "rmse")
  ) |>
    fit(data = training_data)

  lm_spec <-
    linear_reg() |>
    set_mode("regression")

  lm_workflow <-
    workflow() |>
    add_recipe(glmnet_recipe) |>
    add_model(lm_spec)

  # BUG FIX: the original called augment(lm_fit, ...) without ever defining
  # lm_fit -- the lm workflow was resampled but never fitted. Fit it here so
  # both models are evaluated the same way.
  lm_fit <- fit(lm_workflow, data = training_data)

  # RMSE for each model on each split.
  training_lm <- augment(lm_fit, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "training")
  training_glmnet <- augment(glmnet_model, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "training")
  test_lm <- augment(lm_fit, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "test")
  test_glmnet <- augment(glmnet_model, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "test")

  # One row per split, one RMSE column per model, plus their difference.
  bind_rows(training_lm, training_glmnet, test_lm, test_glmnet) |>
    select(model, data, rmse = .estimate) |>
    pivot_wider(names_from = model, values_from = rmse) |>
    mutate(diff = glmnet - lm)
}
# Repeat the simulation 1000 times, tagging each replicate with an `id`
# column, then stack the per-run RMSE comparisons into one tibble.
sim <- map_dfr(seq_len(1000), \(i) fit_models(), .id = "id")

# Density of the glmnet-minus-lm RMSE gap, faceted by train vs. test split.
sim |>
  ggplot(aes(x = diff, fill = data)) +
  facet_wrap(~ data) +
  geom_density(color = NA)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment