Created
November 25, 2025 18:08
-
-
Save battenr/c6668f3900d8479162a2a8e942006e52 to your computer and use it in GitHub Desktop.
Modelling Confounding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Title: Modelling Confounding | |
| # Description: Demonstrating how it's important to consider the relationship between | |
| # variables. Directed acyclic graphs (DAGs) are helpful to identify what to | |
| # adjust for, and what not to. However, we need to also consider the relationship | |
| # that we are trying to model. | |
| # This code simulates data, where there is a nonlinear relationship between | |
| # sleep (z1) and happiness (y). The relationship is plotted, and two models fit. | |
| # One model assuming a linear relationship (incorrectly), one model assuming | |
| # a nonlinear relationship | |
| # Setup ---- | |
| #... Libraries ---- | |
| library(tidyverse) # ol faithful | |
| library(splines) # for fitting splines | |
| library(broom) # for tidying outputs | |
| #... Functions ---- | |
| # Description: Skeleton for Simulating Data | |
| sim_data <- function(n = 250, # sample size | |
| beta_trt = 1.5, # treatment effect | |
| # Parameters for Z1 | |
| z1_mean = 5, z1_sd = 2, | |
| # Parameters for Z2 | |
| z2_size = 1, z2_prob = 0.5, | |
| # Confounder - Effect on X | |
| z1_on_x = 0.5, z2_on_x = 0.2, | |
| # Confounder - Effect on Y | |
| z1_on_y = 0.2, z2_on_y = 0.3){ | |
| # Creating the Dataframe | |
| df <- data.frame( | |
| z1 = rnorm(n = n, mean = z1_mean, sd = z1_sd), | |
| z2 = rbinom(n = n, size = z2_size, prob = z2_prob) | |
| ) %>% | |
| dplyr::mutate( | |
| prob = plogis(z1_on_x*z1 + z2_on_x*z2), | |
| x = rbinom(n = n, size = 1, prob = prob), | |
| y = beta_trt*x + z1_on_y*z1^(2.5) + z2_on_y*z2 + rnorm(n = n, mean = 0, sd = 1) | |
| # Note: nonlinear relationship between z1 & y | |
| ) | |
| # Return the dataframe | |
| return(df) | |
| } | |
| # Simulating Data ---- | |
| df <- sim_data() | |
| # Plots ---- | |
| # Plotting z1 vs y to explore this relationship | |
| ggplot(data = df, | |
| mapping = aes(x = z1, y = y)) + | |
| geom_point(color = "purple", size = 2) + | |
| theme_minimal() + | |
| labs(x = "Hours of Sleep", | |
| y = "Happiness") + | |
| ggtitle("Hours of Sleep vs Happiness") + | |
| theme( | |
| plot.title = element_text(hjust = 0.5, face = "bold", size = 26), | |
| plot.subtitle = element_text(hjust = 0.5, size = 24), | |
| text = element_text(size = 24) | |
| ) + | |
| lims(x = c(0, 10), y = c(0, 80)) | |
| # Fitting Models ---- | |
| #... Model 1 ---- | |
| # Only linear terms | |
| mod1 <- glm(y ~ x + z1 + z2, | |
| family = gaussian(), | |
| data = df) | |
| # Cleaning model output | |
| mod1 %>% | |
| broom::tidy(conf.int = TRUE) | |
| #... Model 2 ---- | |
| # Using a spline for z1 | |
| mod2 <- glm(y ~ x + splines::ns(z1, 4) + z2, | |
| family = gaussian(), | |
| data = df) | |
| # Cleaning model output | |
| mod2 %>% | |
| broom::tidy(conf.int = TRUE) | |
| # Simulation ---- | |
| # Simulating data and fitting models 1000 times. Each time, calculating the bias. | |
| # The bias and Monte Carlo SE of bias for each scenario is calculated then compared. | |
| # Note: 1000 replications was chosen arbitrarily | |
| #... Model 1 ---- | |
| # Function for simulating data and fitting model that assumes linear relationship | |
| # between z1 & y | |
| linear_z1 <- function(sample.size){ | |
| trt_effect <- 1.5 | |
| df <- sim_data() # simulating data | |
| # fitting model | |
| mod <- glm(y ~ x + z1 + z2, | |
| family = gaussian(), | |
| data = df) | |
| # estimated effect | |
| expected_value <- broom::tidy(mod)$estimate[2] | |
| # calculating bias | |
| bias_for_one = expected_value - trt_effect | |
| # creating dataframe | |
| df_bias = data.frame( | |
| bias_for_one | |
| ) | |
| return(df_bias) # returning dataframe | |
| } | |
| # Repeating 1000 times using a sample size of 250 | |
| output_list <- replicate(1000, linear_z1(sample.size = 250), simplify = FALSE) | |
| df.out <- do.call(rbind, output_list) %>% # reformatting | |
| # Adding a new column that will be used to estimate the Monte Carlo SE of the estimate. | |
| mutate( | |
| squared = (bias_for_one - mean(bias_for_one))^2 | |
| ) | |
| # Calculating the mean bias and Monte Carlo SE of estimate | |
| # See Morris et al. (2019) for details on calculating these | |
| bias.linear <- mean(df.out$bias_for_one) %>% round(5) # bias | |
| bias.se.linear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1) | |
| #... Model 2 ---- | |
| # Function for simulating data and fitting model that assumes nonlinear relationship | |
| # between z1 & y | |
| nonlinear_z1 <- function(sample.size){ | |
| trt_effect <- 1.5 | |
| df <- sim_data() | |
| mod <- glm(y ~ x + splines::ns(z1, 4) + z2, | |
| family = gaussian(), | |
| data = df) | |
| expected_value <- broom::tidy(mod)$estimate[2] | |
| bias_for_one = expected_value - trt_effect | |
| df_bias = data.frame( | |
| bias_for_one | |
| ) | |
| return(df_bias) | |
| } | |
| # Repeating 1000 times using a sample size of 250 | |
| output_list <- replicate(1000, nonlinear_z1(sample.size = 250), simplify = FALSE) | |
| df.out <- do.call(rbind, output_list) %>% # reformatting | |
| # Adding a new column that will be used to estimate the Monte Carlo SE of the estimate. | |
| mutate( | |
| squared = (bias_for_one - mean(bias_for_one))^2 | |
| ) | |
| # Calculating the mean bias and Monte Carlo SE of estimate | |
| # See Morris et al. (2019) for details on calculating these | |
| bias.nonlinear <- mean(df.out$bias_for_one) %>% round(5) # bias | |
| bias.se.nonlinear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1) | |
| # Combining Results ---- | |
| # Reformatting into a dataframe for easier comparison | |
| data.frame( | |
| model = c("Linear", "Non-Linear"), | |
| bias = c(bias.linear, bias.nonlinear), | |
| sebias = c(bias.se.linear, bias.se.nonlinear) | |
| ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment