battenr · November 25, 2025 18:08
diff --git a/modelling_confounding.R b/modelling_confounding.R
 # Title: Modelling Confounding

 # Description: Demonstrating how it's important to consider the relationship between
 # variables. Directed acyclic graphs (DAGs) are helpful to identify what to 
 # adjust for, and what not to. However, we need to also consider the relationship
 # that we are trying to model. 

 # This code simulates data, where there is a nonlinear relationship between 
 # sleep (z1) and happiness (y). The relationship is plotted, and two models fit. 
 # One model assuming a linear relationship (incorrectly), one model assuming 
 # a nonlinear relationship

 # Setup ----

 #... Libraries ----

 library(tidyverse) # ol faithful
 library(splines) # for fitting splines
 library(broom) # for tidying outputs

 #... Functions ----

 # Description: Skeleton for Simulating Data 

 sim_data <- function(n = 250, # sample size 
                     beta_trt = 1.5, # treatment effect
                     # Parameters for Z1 
                     z1_mean = 5, z1_sd = 2, 
                     # Parameters for Z2
                     z2_size = 1, z2_prob = 0.5, 
                     # Confounder - Effect on X
                     z1_on_x = 0.5, z2_on_x = 0.2,
                     # Confounder - Effect on Y
                     z1_on_y = 0.2, z2_on_y = 0.3){
  
  # Creating the Dataframe
  
  df <- data.frame(
    z1 = rnorm(n = n, mean = z1_mean, sd = z1_sd), 
    z2 = rbinom(n = n, size = z2_size, prob = z2_prob)
  ) %>% 
    dplyr::mutate(
      prob = plogis(z1_on_x*z1 + z2_on_x*z2), 
      x = rbinom(n = n, size = 1, prob = prob), 
      y = beta_trt*x + z1_on_y*z1^(2.5) + z2_on_y*z2 + rnorm(n = n, mean = 0, sd = 1)
      # Note: nonlinear relationship between z1 & y
    )
  
  # Return the dataframe
  
  return(df)
 }

 # Simulating Data ----

 df <- sim_data()

 # Plots ----

 # Plotting z1 vs y to explore this relationship

 ggplot(data = df, 
       mapping = aes(x = z1, y = y)) + 
  geom_point(color = "purple", size = 2) +
  theme_minimal() +
  labs(x = "Hours of Sleep", 
       y = "Happiness") + 
  ggtitle("Hours of Sleep vs Happiness") + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 26),
    plot.subtitle = element_text(hjust = 0.5, size = 24),
    text = element_text(size = 24)
  ) +
  lims(x = c(0, 10), y = c(0, 80))

 # Fitting Models ----

 #... Model 1 ----

 # Only linear terms

 mod1 <- glm(y ~ x + z1 + z2, 
            family = gaussian(),
            data = df)

 # Cleaning model output

 mod1 %>% 
  broom::tidy(conf.int = TRUE)

 #... Model 2 ----

 # Using a spline for z1

 mod2 <- glm(y ~ x + splines::ns(z1, 4) + z2, 
            family = gaussian(),
            data = df)

 # Cleaning model output

 mod2 %>% 
  broom::tidy(conf.int = TRUE)

 # Simulation ----

 # Simulating data and fitting models 1000 times. Each time, calculating the bias. 
 # The bias and Monte Carlo SE of bias for each scenario is calculated then compared. 

 # Note: 1000 replications was chosen arbitrarily

 #... Model 1 ----

 # Function for simulating data and fitting model that assumes linear relationship
 # between z1 & y

 linear_z1 <- function(sample.size){
  
  trt_effect <- 1.5 
  
  df <- sim_data() # simulating data
  
  # fitting model
  
  mod <- glm(y ~ x + z1 + z2, 
              family = gaussian(),
              data = df)
  
  # estimated effect
  
  expected_value <- broom::tidy(mod)$estimate[2]
  
  # calculating bias 
  
  bias_for_one = expected_value - trt_effect
  
  # creating dataframe
  
  df_bias = data.frame(
    bias_for_one
  )
  
  return(df_bias) # returning dataframe
  
 }

 # Repeating 1000 times using a sample size of 250

 output_list <- replicate(1000, linear_z1(sample.size = 250), simplify = FALSE)

 df.out <- do.call(rbind, output_list) %>% # reformatting
  
  # Adding a new column that will be used to estimate the Monte Carlo SE of the estimate. 
  
  mutate(
    squared = (bias_for_one - mean(bias_for_one))^2
  )

 # Calculating the mean bias and Monte Carlo SE of estimate

 # See Morris et al. (2019) for details on calculating these

 bias.linear <- mean(df.out$bias_for_one) %>% round(5) # bias  
 bias.se.linear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)


 #... Model 2 ----

 # Function for simulating data and fitting model that assumes nonlinear relationship
 # between z1 & y

 nonlinear_z1 <- function(sample.size){
  
  trt_effect <- 1.5 
  
  df <- sim_data() 
  
  mod <- glm(y ~ x + splines::ns(z1, 4) + z2, 
             family = gaussian(),
             data = df)
  
  expected_value <- broom::tidy(mod)$estimate[2]
  
  bias_for_one = expected_value - trt_effect
  
  df_bias = data.frame(
    bias_for_one
  )
  
  return(df_bias)
  
 }

 # Repeating 1000 times using a sample size of 250

 output_list <- replicate(1000, nonlinear_z1(sample.size = 250), simplify = FALSE)

 df.out <- do.call(rbind, output_list) %>% # reformatting
  
  # Adding a new column that will be used to estimate the Monte Carlo SE of the estimate. 
  
  mutate(
    squared = (bias_for_one - mean(bias_for_one))^2
  )

 # Calculating the mean bias and Monte Carlo SE of estimate

 # See Morris et al. (2019) for details on calculating these

 bias.nonlinear <- mean(df.out$bias_for_one) %>% round(5) # bias  
 bias.se.nonlinear <- sqrt(sum(df.out$squared)*(1 / (1000*999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)

 # Combining Results ----

 # Reformatting into a dataframe for easier comparison

 data.frame(
  model = c("Linear", "Non-Linear"),
  bias = c(bias.linear, bias.nonlinear), 
  sebias = c(bias.se.linear, bias.se.nonlinear)
 )
	# Title: Modelling Confounding

	# Description: Demonstrating how it's important to consider the relationship between
	# variables. Directed acyclic graphs (DAGs) are helpful to identify what to
	# adjust for, and what not to. However, we need to also consider the relationship
	# that we are trying to model.

	# This code simulates data, where there is a nonlinear relationship between
	# sleep (z1) and happiness (y). The relationship is plotted, and two models fit.
	# One model assuming a linear relationship (incorrectly), one model assuming
	# a nonlinear relationship

	# Setup ----

	#... Libraries ----

	library(tidyverse) # ol faithful
	library(splines) # for fitting splines
	library(broom) # for tidying outputs

	#... Functions ----

	# Description: Skeleton for Simulating Data

	sim_data <- function(n = 250, # sample size
	beta_trt = 1.5, # treatment effect
	# Parameters for Z1
	z1_mean = 5, z1_sd = 2,
	# Parameters for Z2
	z2_size = 1, z2_prob = 0.5,
	# Confounder - Effect on X
	z1_on_x = 0.5, z2_on_x = 0.2,
	# Confounder - Effect on Y
	z1_on_y = 0.2, z2_on_y = 0.3){

	# Creating the Dataframe

	df <- data.frame(
	z1 = rnorm(n = n, mean = z1_mean, sd = z1_sd),
	z2 = rbinom(n = n, size = z2_size, prob = z2_prob)
	) %>%
	dplyr::mutate(
	prob = plogis(z1_on_xz1 + z2_on_xz2),
	x = rbinom(n = n, size = 1, prob = prob),
	y = beta_trtx + z1_on_yz1^(2.5) + z2_on_y*z2 + rnorm(n = n, mean = 0, sd = 1)
	# Note: nonlinear relationship between z1 & y
	)

	# Return the dataframe

	return(df)
	}

	# Simulating Data ----

	df <- sim_data()

	# Plots ----

	# Plotting z1 vs y to explore this relationship

	ggplot(data = df,
	mapping = aes(x = z1, y = y)) +
	geom_point(color = "purple", size = 2) +
	theme_minimal() +
	labs(x = "Hours of Sleep",
	y = "Happiness") +
	ggtitle("Hours of Sleep vs Happiness") +
	theme(
	plot.title = element_text(hjust = 0.5, face = "bold", size = 26),
	plot.subtitle = element_text(hjust = 0.5, size = 24),
	text = element_text(size = 24)
	) +
	lims(x = c(0, 10), y = c(0, 80))

	# Fitting Models ----

	#... Model 1 ----

	# Only linear terms

	mod1 <- glm(y ~ x + z1 + z2,
	family = gaussian(),
	data = df)

	# Cleaning model output

	mod1 %>%
	broom::tidy(conf.int = TRUE)

	#... Model 2 ----

	# Using a spline for z1

	mod2 <- glm(y ~ x + splines::ns(z1, 4) + z2,
	family = gaussian(),
	data = df)

	# Cleaning model output

	mod2 %>%
	broom::tidy(conf.int = TRUE)

	# Simulation ----

	# Simulating data and fitting models 1000 times. Each time, calculating the bias.
	# The bias and Monte Carlo SE of bias for each scenario is calculated then compared.

	# Note: 1000 replications was chosen arbitrarily

	#... Model 1 ----

	# Function for simulating data and fitting model that assumes linear relationship
	# between z1 & y

	linear_z1 <- function(sample.size){

	trt_effect <- 1.5

	df <- sim_data() # simulating data

	# fitting model

	mod <- glm(y ~ x + z1 + z2,
	family = gaussian(),
	data = df)

	# estimated effect

	expected_value <- broom::tidy(mod)$estimate[2]

	# calculating bias

	bias_for_one = expected_value - trt_effect

	# creating dataframe

	df_bias = data.frame(
	bias_for_one
	)

	return(df_bias) # returning dataframe

	}

	# Repeating 1000 times using a sample size of 250

	output_list <- replicate(1000, linear_z1(sample.size = 250), simplify = FALSE)

	df.out <- do.call(rbind, output_list) %>% # reformatting

	# Adding a new column that will be used to estimate the Monte Carlo SE of the estimate.

	mutate(
	squared = (bias_for_one - mean(bias_for_one))^2
	)

	# Calculating the mean bias and Monte Carlo SE of estimate

	# See Morris et al. (2019) for details on calculating these

	bias.linear <- mean(df.out$bias_for_one) %>% round(5) # bias
	bias.se.linear <- sqrt(sum(df.out$squared)(1 / (1000999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)


	#... Model 2 ----

	# Function for simulating data and fitting model that assumes nonlinear relationship
	# between z1 & y

	nonlinear_z1 <- function(sample.size){

	trt_effect <- 1.5

	df <- sim_data()

	mod <- glm(y ~ x + splines::ns(z1, 4) + z2,
	family = gaussian(),
	data = df)

	expected_value <- broom::tidy(mod)$estimate[2]

	bias_for_one = expected_value - trt_effect

	df_bias = data.frame(
	bias_for_one
	)

	return(df_bias)

	}

	# Repeating 1000 times using a sample size of 250

	output_list <- replicate(1000, nonlinear_z1(sample.size = 250), simplify = FALSE)

	df.out <- do.call(rbind, output_list) %>% # reformatting

	# Adding a new column that will be used to estimate the Monte Carlo SE of the estimate.

	mutate(
	squared = (bias_for_one - mean(bias_for_one))^2
	)

	# Calculating the mean bias and Monte Carlo SE of estimate

	# See Morris et al. (2019) for details on calculating these

	bias.nonlinear <- mean(df.out$bias_for_one) %>% round(5) # bias
	bias.se.nonlinear <- sqrt(sum(df.out$squared)(1 / (1000999))) %>% round(5) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1)

	# Combining Results ----

	# Reformatting into a dataframe for easier comparison

	data.frame(
	model = c("Linear", "Non-Linear"),
	bias = c(bias.linear, bias.nonlinear),
	sebias = c(bias.se.linear, bias.se.nonlinear)
	)
No results found