Created
April 11, 2025 19:29
-
-
Save battenr/e2f88cd4b15ae0ffaa4a28a19fdaf6c4 to your computer and use it in GitHub Desktop.
IPTW vs MSM for Time-Varying Confounding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Title: Time-Varying Confounding (MSM vs IPTW) | |
| # Description: Demonstrating how marginal structural models provide | |
| # less biased estimates compared to using IPTW at baseline only, when | |
| # there is time-varying confounding. | |
| # Setup ---- | |
| #... Libraries ---- | |
| library(tidyverse) # ol faithful | |
| library(WeightIt) # for creating weights | |
| library(geepack) # for using GEEs | |
| library(broom) # for cleaning outputs from models | |
| # Function for Simulating Data ---- | |
| # Note for transparency: | |
| # I wrote code for baseline, and used ChatGPT to help assist with: | |
| # - adding multiple timepoints | |
| # - turning this into a function | |
| simulate_data_and_model <- function(n = 500, Tmax = 5) { | |
| # Tmax is the total number of timepoints. For simplicity, | |
| # included that each individual had the same number of timepoints. | |
| # Initialize a list. This will be used to add the simulated data | |
| data_list <- list() | |
| # Simulate data for each individual | |
| # This for loop is for each individual. The code works by running through each timepoint | |
| # one individual at a time. Then it moves on to the next individual. | |
| for (i in 1:n) { | |
| # Baseline values (at time 0) | |
| # The rationale for using the term _prev, is the "current" timepoint will become the | |
| # "previous" for the next timepoint. The term baseline only applies to the first | |
| # measurement. | |
| sleep_prev <- runif(n = 1, min = 2, max = 10) | |
| weightlifting_prev <- rbinom(n = 1, size = 1, prob = 0.4) | |
| coffee_prev <- rbinom(n = 1, size = 1, prob = plogis(0.5 * weightlifting_prev - 0.1 * sleep_prev)) | |
| happiness_prev <- 1.5 * coffee_prev + 0.5 * sleep_prev + 2 * weightlifting_prev + rnorm(n = 1, mean = 0, sd = 1) | |
| # Storing this data, since this will be the baseline values. | |
| data_list[[length(data_list) + 1]] <- data.frame( | |
| id = i, | |
| time = 0, | |
| sleep = sleep_prev, | |
| weightlifting = weightlifting_prev, | |
| coffee = coffee_prev, | |
| happiness = happiness_prev | |
| ) | |
| # Loop through time points (from ) | |
| for (t in 1:Tmax) { | |
| # Simulate time-varying confounders and exposure | |
| sleep_curr <- runif(n = 1, min = 2, max = 10) + 1.5 * weightlifting_prev - 0.5 * coffee_prev | |
| weightlifting_curr <- rbinom(n = 1, size = 1, prob = plogis(0.4 * sleep_prev + 0.3 * coffee_prev)) | |
| coffee_curr <- rbinom(n = 1, size = 1, prob = plogis(0.5 * weightlifting_curr - 0.1 * sleep_curr)) | |
| happiness_curr <- 1.5 * coffee_curr + 0.5 * sleep_curr + 2 * weightlifting_curr + rnorm(n = 1, mean = 0, sd = 1) | |
| # Store the simulated data for this individual at time t | |
| data_list[[length(data_list) + 1]] <- data.frame( | |
| id = i, | |
| time = t, | |
| sleep = sleep_curr, | |
| weightlifting = weightlifting_curr, | |
| coffee = coffee_curr, | |
| happiness = happiness_curr | |
| ) | |
| # Update previous values for the next time point | |
| sleep_prev <- sleep_curr | |
| weightlifting_prev <- weightlifting_curr | |
| coffee_prev <- coffee_curr | |
| happiness_prev <- happiness_curr | |
| } | |
| } | |
| # Combine into a long-format data frame | |
| df <- do.call(rbind, data_list) | |
| # Weighting at baseline. Here, using IPTW (through the WeightIt package to create weights). | |
| # This is done assuming that the confounders don't change over time (which we know is wrong) | |
| wghts_bl <- WeightIt::weightit( | |
| coffee ~ sleep + weightlifting, | |
| data = df %>% filter(time == 0), | |
| stabilize = TRUE, # want stabilized weights | |
| estimand = "ATE" | |
| ) | |
| # Creating Dataframe with IDs and weights | |
| wghts_bl <- data.frame( | |
| id = unique(df$id), | |
| wght = wghts_bl$weights | |
| ) | |
| # Adding weights the the dataframe | |
| df_ipw <- df %>% | |
| left_join(wghts_bl, by = "id") | |
| # Fit the model with IPW at baseline | |
| mod_ipw <- geeglm(happiness ~ coffee, | |
| data = df_ipw, | |
| id = id, | |
| weights = wght) | |
| # MSM Weights. | |
| # These are similar to IPTW, however the weights change over time | |
| msm_weights <- WeightIt::weightit( | |
| coffee ~ sleep + weightlifting, | |
| data = df, | |
| stabilize = TRUE | |
| ) | |
| wghts_msm <- data.frame( | |
| id = df$id, | |
| time = df$time, | |
| wght = msm_weights$weights | |
| ) | |
| df_msm <- df %>% | |
| left_join(wghts_msm, by = c("id", "time")) | |
| # Fit the model with MSM weights | |
| mod_msm <- geeglm(happiness ~ coffee, | |
| data = df_msm, | |
| id = id, | |
| weights = wght) | |
| # Return the estimates. Later these will be used to calculate bias and Monte Carlo SE of bias. | |
| results = data.frame( | |
| mod_ipw = broom::tidy(mod_ipw) %>% filter(term == "coffee") %>% select(estimate) %>% rename(ipw_estimate = estimate), | |
| mod_msm = broom::tidy(mod_msm) %>% filter(term == "coffee") %>% select(estimate) %>% rename(msm_estimate = estimate) | |
| ) | |
| return(results) | |
| } | |
| # Repeat, Repeat, Repeat Again! ---- | |
| # Here we are repeating 1000 times. A couple of notes: | |
| # - 1000 is arbitrary. In reality, there are ways to determine the number of repeitions required | |
| # - This code may take a while to run. If interested in a smaller example, try changing the | |
| # number of reptitions to a smaller number. | |
| set.seed(456) # setting seed for reproducibility | |
| output_list <- replicate(1000, # this is number of repeitions (make smaller if you want quicker run time) | |
| simulate_data_and_model(), | |
| simplify = FALSE) | |
| # Calculating Metrics ---- | |
| # Using bias for metric. | |
| # These formulas are from Morris et al. 2019. | |
| df.out <- do.call(rbind, output_list) %>% | |
| mutate( | |
| bias_ipw = 1.5 - ipw_estimate, # it's 1.5 because that's what we defined the treatment effect as | |
| bias_msm = 1.5 - msm_estimate, | |
| squared_ipw = (bias_ipw - mean(bias_ipw)) ^2, # will be used later | |
| squared_msm = (bias_msm - mean(bias_msm)) ^2 # will be used later | |
| ) | |
| #... Calculating Bias ---- | |
| mean(df.out$bias_ipw) | |
| mean(df.out$bias_msm) | |
| #... Monte Carlo SE of Estimate ---- | |
| sqrt(sum(df.out$squared_ipw)*(1 / (1000*999))) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1) | |
| sqrt(sum(df.out$squared_msm)*(1 / (1000*999))) # Monte Carlo SE of bias (1000 is number of simulations, 999 is n - 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment