Skip to content

Instantly share code, notes, and snippets.

@battenr
Created November 25, 2025 18:08
Show Gist options
  • Select an option

  • Save battenr/c6668f3900d8479162a2a8e942006e52 to your computer and use it in GitHub Desktop.

Select an option

Save battenr/c6668f3900d8479162a2a8e942006e52 to your computer and use it in GitHub Desktop.
Modelling Confounding
# Title: Modelling Confounding
# Description: Demonstrating how it's important to consider the relationship between
# variables. Directed acyclic graphs (DAGs) are helpful to identify what to
# adjust for, and what not to. However, we need to also consider the relationship
# that we are trying to model.
# This code simulates data, where there is a nonlinear relationship between
# sleep (z1) and happiness (y). The relationship is plotted, and two models fit.
# One model assuming a linear relationship (incorrectly), one model assuming
# a nonlinear relationship
# Setup ----
#... Libraries ----
library(tidyverse) # ol faithful
library(splines) # for fitting splines
library(broom) # for tidying outputs
#... Functions ----
# Description: Skeleton for Simulating Data
sim_data <- function(n = 250, # sample size
beta_trt = 1.5, # treatment effect
# Parameters for Z1
z1_mean = 5, z1_sd = 2,
# Parameters for Z2
z2_size = 1, z2_prob = 0.5,
# Confounder - Effect on X
z1_on_x = 0.5, z2_on_x = 0.2,
# Confounder - Effect on Y
z1_on_y = 0.2, z2_on_y = 0.3){
# Creating the Dataframe
df <- data.frame(
z1 = rnorm(n = n, mean = z1_mean, sd = z1_sd),
z2 = rbinom(n = n, size = z2_size, prob = z2_prob)
) %>%
dplyr::mutate(
prob = plogis(z1_on_x*z1 + z2_on_x*z2),
x = rbinom(n = n, size = 1, prob = prob),
y = beta_trt*x + z1_on_y*z1^(2.5) + z2_on_y*z2 + rnorm(n = n, mean = 0, sd = 1)
# Note: nonlinear relationship between z1 & y
)
# Return the dataframe
return(df)
}
# Simulating Data ----
df <- sim_data()
# Plots ----
# Plotting z1 vs y to explore this relationship
ggplot(data = df,
mapping = aes(x = z1, y = y)) +
geom_point(color = "purple", size = 2) +
theme_minimal() +
labs(x = "Hours of Sleep",
y = "Happiness") +
ggtitle("Hours of Sleep vs Happiness") +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 26),
plot.subtitle = element_text(hjust = 0.5, size = 24),
text = element_text(size = 24)
) +
lims(x = c(0, 10), y = c(0, 80))
# Fitting Models ----
#... Model 1 ----
# Only linear terms
mod1 <- glm(y ~ x + z1 + z2,
family = gaussian(),
data = df)
# Cleaning model output
mod1 %>%
broom::tidy(conf.int = TRUE)
#... Model 2 ----
# Using a spline for z1
mod2 <- glm(y ~ x + splines::ns(z1, 4) + z2,
family = gaussian(),
data = df)
# Cleaning model output
mod2 %>%
broom::tidy(conf.int = TRUE)
# Simulation ----
# Simulating data and fitting models 1000 times. Each time, calculating the bias.
# The bias and Monte Carlo SE of bias for each scenario is calculated then compared.
# Note: 1000 replications was chosen arbitrarily
#... Model 1 ----
# Function for simulating data and fitting model that assumes linear relationship
# between z1 & y
linear_z1 <- function(sample.size){
trt_effect <- 1.5
df <- sim_data() # simulating data
# fitting model
mod <- glm(y ~ x + z1 + z2,
family = gaussian(),
data = df)
# estimated effect
expected_value <- broom::tidy(mod)$estimate[2]
# calculating bias
bias_for_one = expected_value - trt_effect
# creating dataframe
df_bias = data.frame(
bias_for_one
)
return(df_bias) # returning dataframe
}
# Repeating 1000 times using a sample size of 250
output_list <- replicate(1000, linear_z1(sample.size = 250), simplify = FALSE)
df.out <- do.call(rbind, output_list) %>% # reformatting
# Adding a new column that will be used to estimate the Monte Carlo SE of the estimate.
mutate(
squared = (bias_for_one - mean(bias_for_one))^2
)
# Calculating the mean bias and Monte Carlo SE of estimate
# See Morris et al. (2019) for details on calculating these
bias.linear <- mean(df.out$bias_for_one) %>% round(5) # bias
bias.se.linear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)
#... Model 2 ----
# Function for simulating data and fitting model that assumes nonlinear relationship
# between z1 & y
nonlinear_z1 <- function(sample.size){
trt_effect <- 1.5
df <- sim_data()
mod <- glm(y ~ x + splines::ns(z1, 4) + z2,
family = gaussian(),
data = df)
expected_value <- broom::tidy(mod)$estimate[2]
bias_for_one = expected_value - trt_effect
df_bias = data.frame(
bias_for_one
)
return(df_bias)
}
# Repeating 1000 times using a sample size of 250
output_list <- replicate(1000, nonlinear_z1(sample.size = 250), simplify = FALSE)
df.out <- do.call(rbind, output_list) %>% # reformatting
# Adding a new column that will be used to estimate the Monte Carlo SE of the estimate.
mutate(
squared = (bias_for_one - mean(bias_for_one))^2
)
# Calculating the mean bias and Monte Carlo SE of estimate
# See Morris et al. (2019) for details on calculating these
bias.nonlinear <- mean(df.out$bias_for_one) %>% round(5) # bias
bias.se.nonlinear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)
# Combining Results ----
# Reformatting into a dataframe for easier comparison
data.frame(
model = c("Linear", "Non-Linear"),
bias = c(bias.linear, bias.nonlinear),
sebias = c(bias.se.linear, bias.se.nonlinear)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment