Created
September 10, 2023 21:21
-
-
Save malcolmbarrett/6156875e781206bfdfe7147414d5016f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#set.seed(123) # Set seed for reproducibility | |
library(tidymodels) | |
#' Compare regularized (glmnet) and ordinary (lm) regression on simulated data.
#'
#' Simulates `n` observations with a mix of independent, noisy, and highly
#' correlated predictors, tunes an elastic-net model via 5-fold CV on a
#' training split, fits a plain linear model on the same recipe, and returns
#' the training/test RMSE for both models along with their difference.
#'
#' @param n Number of observations to simulate (default 100).
#' @return A two-row tibble (one row per split) with columns
#'   `data`, `lm`, `glmnet`, and `diff` (glmnet RMSE minus lm RMSE).
fit_models <- function(n = 100) {
  # Predictors: x1-x4 independent, x5/x6 noisy copies of x1/x2,
  # x7/x8 near-collinear combinations, x9/x10 additional independent noise.
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rnorm(n)
  x4 <- rnorm(n)
  x5 <- x1 + rnorm(n, sd = 0.75)              # noisy version of x1
  x6 <- x2 + rnorm(n, sd = 0.75)              # noisy version of x2
  x7 <- 3 * x1 + 2 * x2 + rnorm(n, sd = 0.1)  # highly correlated with x1, x2
  x8 <- 3 * x3 + 2 * x4 + rnorm(n, sd = 0.1)  # highly correlated with x3, x4
  x9 <- rnorm(n)
  x10 <- rnorm(n)

  # Outcome depends on x1/x2 directly, but those columns are withheld from the
  # modeling data below, so the models must recover their signal via x5-x8.
  outcome <- 2 + 3 * x1 - 1.5 * x2 + 0.5 * x3 + x7 + x8 + x9 + x10 + rnorm(n)

  # NOTE: x1 and x2 are deliberately excluded from the modeling data.
  data <- tibble::tibble(outcome, x3, x4, x5, x6, x7, x8, x9, x10)

  data_split <- initial_split(data, prop = 0.8)
  training_data <- training(data_split)
  test_data <- testing(data_split)

  # Shared preprocessing: drop zero-variance columns, standardize predictors
  # (normalization matters for the penalized model).
  glmnet_recipe <-
    recipe(outcome ~ x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10,
           data = training_data) |>
    step_zv(all_predictors()) |>
    step_normalize(all_numeric_predictors())

  glmnet_spec <-
    linear_reg(penalty = tune(), mixture = tune()) |>
    set_mode("regression") |>
    set_engine("glmnet")

  glmnet_workflow <-
    workflow() |>
    add_recipe(glmnet_recipe) |>
    add_model(glmnet_spec)

  # Regular grid over penalty (log scale) and mixture (ridge -> lasso).
  glmnet_grid <- tidyr::crossing(
    penalty = 10^seq(-6, -1, length.out = 20),
    mixture = c(0.05, 0.2, 0.4, 0.6, 0.8, 1)
  )

  folds <- vfold_cv(training_data, v = 5)

  glmnet_tune <-
    tune_grid(glmnet_workflow, resamples = folds, grid = glmnet_grid)

  # Refit the best (by CV RMSE) glmnet configuration on the full training set.
  glmnet_model <- finalize_workflow(
    glmnet_workflow,
    select_best(glmnet_tune, metric = "rmse")
  ) |>
    fit(data = training_data)

  lm_spec <-
    linear_reg() |>
    set_mode("regression")

  lm_workflow <-
    workflow() |>
    add_recipe(glmnet_recipe) |>
    add_model(lm_spec)

  # BUG FIX: the original called augment(lm_fit, ...) without ever defining
  # lm_fit -- the lm workflow was resampled but never fitted. Fit it here so
  # both models are evaluated the same way.
  lm_fit <- fit(lm_workflow, data = training_data)

  # RMSE for each model on each split.
  training_lm <- augment(lm_fit, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "training")
  training_glmnet <- augment(glmnet_model, new_data = training_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "training")
  test_lm <- augment(lm_fit, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "lm", data = "test")
  test_glmnet <- augment(glmnet_model, new_data = test_data) |>
    rmse(truth = outcome, estimate = .pred) |>
    mutate(model = "glmnet", data = "test")

  # One row per split, one RMSE column per model, plus their difference.
  bind_rows(training_lm, training_glmnet, test_lm, test_glmnet) |>
    select(model, data, rmse = .estimate) |>
    pivot_wider(names_from = model, values_from = rmse) |>
    mutate(diff = glmnet - lm)
}
# Repeat the simulation 1000 times, tagging each replicate with an `id`
# column, then stack the per-run RMSE comparisons into one tibble.
sim <- map_dfr(seq_len(1000), \(i) fit_models(), .id = "id")

# Density of the glmnet-minus-lm RMSE gap, faceted by train vs. test split.
sim |>
  ggplot(aes(x = diff, fill = data)) +
  facet_wrap(~ data) +
  geom_density(color = NA)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment