@mingjiphd
Created January 27, 2026 18:57
Causal Analysis using R: G-Estimation using GEE Models
This R script is a step-by-step tutorial showing how to perform G-estimation of the causal effects of potentially time-varying covariates from observational, longitudinal data. G-estimation is one of the g-methods, developed by James Robins for causal inference from longitudinal, observational data with potentially time-varying covariates under weaker assumptions than standard regression. In the accompanying video (linked below), we show how to implement G-estimation using GEE in R. The implementation consists of specifying an estimating function, a grid, and a grid search that minimizes the estimating function.
A step by step video demonstration can be found at: https://youtu.be/9n0YP8cGyuo?si=z1AeQsFOFsHCHzXN
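The core model behind the script is a simple one-parameter structural nested model; a minimal sketch of the logic the code implements, assuming the additive effect used in the simulation, is:

$$
H(\psi) = Y - \psi A, \qquad
\operatorname{logit} P(A = 1 \mid L, H(\psi)) = \alpha_0 + \alpha_1 L + \alpha_2 H(\psi)
$$

Under no unmeasured confounding given $L$, the candidate value $\psi$ equals the causal effect exactly when $H(\psi)$ is conditionally independent of treatment given $L$, i.e. when $\alpha_2 = 0$. The grid search therefore evaluates the fitted $\hat{\alpha}_2$ over candidate values of $\psi$ and keeps the value that drives $|\hat{\alpha}_2|$ closest to zero.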
#### Causal Analysis using R: G-Estimation using GEE
## James Robins' G-estimation is one of the g-methods, a class of causal inference techniques
## including the g-formula and marginal structural models, designed for complex longitudinal
## studies with time-varying confounders.
#### It specifically focuses on structural nested models and uses the conditional independence
## between treatment and potential outcomes to consistently estimate causal effects
## under weaker assumptions than standard regression.
#### Developed to address challenges in causal inference with time-varying treatments,
## G-estimation is important in epidemiology and comparative effectiveness research.
#### This method corrects bias from time-varying confounding and treatment noncompliance,
## enabling more accurate estimation of causal effects in complex treatment settings.
#### Part I: Implementing G Estimation using GEE for Constant Covariates
## Synthetic data: simulates confounder, treatment, and outcome
## Propensity score: estimates the probability of treatment given the confounder
## GEE fit for G-estimation: uses GEE to model treatment with the outcome transformed by a candidate parameter
## Grid search: optimizes the parameter to solve for the estimated causal effect
### Step 1: Synthetic Data Generation for G Estimation
# Load necessary libraries
library(geepack)
# Step 1: Simulate data
set.seed(11172025)
n <- 1000
L <- rbinom(n, 1, 0.5) # Confounder
A <- rbinom(n, 1, plogis(0.5 * L)) # Treatment, dependent on L
# Outcome Y depends on A and L with an additive effect; the true causal effect of A is 2
Y <- 2 * A + L + rnorm(n)
data <- data.frame(L, A, Y)
head(data)
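## Quick check (optional): L shifts the probability of treatment, so A and Y share the
## common cause L; this is the confounding that the G-estimation step has to remove.
with(data, tapply(A, L, mean))  # treatment prevalence by confounder level
with(data, tapply(Y, A, mean))  # crude outcome means by treatment group (confounded by L)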
# Step 2: Calculate predicted probability of treatment given confounder (propensity score)
ps_model <- glm(A ~ L, family = binomial, data = data)
data$ea <- predict(ps_model, type = "response")
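# Note: ea (the fitted propensity score) is not used by the grid-search implementation
# below, which conditions on L directly inside the treatment model; an equivalent
# propensity-score form of the estimating equation is sketched after Step 4.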
# Step 3: Define estimating function for G-estimation parameter (x) with grid search
g_estimation_func <- function(x, data){
  # Candidate "treatment-free" outcome under effect x
  Hx <- data$Y - x * data$A
  # Treatment model including Hx; at the true x the coefficient of Hx should be zero
  fit <- geeglm(A ~ L + Hx, family = binomial, data = data, id = 1:n, corstr = "independence")
  coef_Hx <- summary(fit)$coefficients["Hx", "Estimate"]
  return(abs(coef_Hx)) # minimize absolute value for zero estimating equation
}
# Step 4: Grid search over plausible values of x to find root
grid <- seq(0, 3, length.out = 100)
est_vals <- sapply(grid, g_estimation_func, data = data)
g_estimate <- grid[which.min(est_vals)]
cat("G-estimation estimate of causal effect:", g_estimate, "\n")
#### Part II: G Estimation in the presence of Time Varying Covariates
library(geepack)
# Step 1: Simulate longitudinal data with time-varying confounders & treatment
set.seed(11172025)
n <- 500 # subjects
T <- 4 # time points
# Create an empty data.frame to hold panel data
data_long <- data.frame(id = rep(1:n, each = T),
                        time = rep(1:T, n),
                        L = NA,
                        A = NA,
                        Y = NA)
# Simulate time-varying confounder L, treatment A, and outcome Y
for (i in 1:n) {
  for (t in 1:T) {
    # Generate L depending on previous A and L (time-varying confounder)
    if (t == 1) {
      data_long$L[(i-1)*T + t] <- rbinom(1, 1, 0.5)
    } else {
      prev_A <- data_long$A[(i-1)*T + t - 1]
      prev_L <- data_long$L[(i-1)*T + t - 1]
      data_long$L[(i-1)*T + t] <- rbinom(1, 1, plogis(0.3 * prev_L - 0.2 * prev_A))
    }
    # Generate treatment A depending on current L (and possibly past)
    data_long$A[(i-1)*T + t] <- rbinom(1, 1, plogis(0.5 * data_long$L[(i-1)*T + t]))
    # Outcome Y depends on current A and L with additive noise; the true causal effect
    # of A is again 2. Outcome is observed at each time point for demonstration.
    data_long$Y[(i-1)*T + t] <- 2 * data_long$A[(i-1)*T + t] +
      1 * data_long$L[(i-1)*T + t] +
      rnorm(1)
  }
}
head(data_long)
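## Quick check (optional): the confounder at time t > 1 depends on the previous treatment,
## which is the treatment-confounder feedback that motivates the g-methods.
later <- data_long$time > 1
A_prev <- data_long$A[which(later) - 1]    # previous-period treatment for the same subject
tapply(data_long$L[later], A_prev, mean)   # P(L = 1) by previous treatment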
# Step 2: Estimate propensity scores at each time point: P(A_t | L_t)
ps_model <- glm(A ~ L + factor(time), family = binomial, data = data_long)
data_long$ea <- predict(ps_model, type = "response")
# Step 3: Define estimating function for G-estimation with time varying covariates
g_estimation_func <- function(x, data){
  # Adjust outcome by the candidate treatment effect parameter over all time points
  Hx <- data$Y - x * data$A
  # Fit GEE model for treatment, adjusting for the time-varying confounder and transformed outcome
  fit <- geeglm(A ~ L + Hx + factor(time), family = binomial, id = data$id,
                corstr = "independence", data = data)
  coef_Hx <- summary(fit)$coefficients["Hx", "Estimate"]
  return(abs(coef_Hx)) # want to minimize absolute value to solve estimating equation
}
# Step 4: Grid search for causal parameter estimate
grid <- seq(0, 3, length.out = 100)
est_vals <- sapply(grid, g_estimation_func, data = data_long)
g_estimate <- grid[which.min(est_vals)]
cat("G-estimation estimate of causal effect with time-varying covariates:", g_estimate, "\n")
## Statistical Inference on the G estimation estimate of causal effect
# Bootstrap to obtain confidence interval for G-estimate
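# Because the estimate comes from a grid search rather than a single model fit, no
# model-based standard error is available to read off; resampling whole subjects
# (all of their time points) preserves within-subject correlation and yields a
# percentile confidence interval instead.
# (Note: this loop fits n_boot x length(grid) = 20,000 GEE models, so it may take a while.)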
n_boot <- 200
boot_estimates <- numeric(n_boot)
set.seed(11172025)
for (b in 1:n_boot) {
  sampled_ids <- sample(1:n, n, replace = TRUE)
  # Rebuild the panel from the resampled subjects, giving each draw a fresh id so that
  # duplicated subjects form distinct, contiguous clusters for geeglm
  boot_data <- do.call(rbind, lapply(seq_along(sampled_ids), function(j) {
    block <- data_long[data_long$id == sampled_ids[j], ]
    block$id <- j
    block
  }))
  est_vals_boot <- sapply(grid, g_estimation_func, data = boot_data)
  boot_estimates[b] <- grid[which.min(est_vals_boot)]
}
# Calculate percentile bootstrap 95% confidence interval
ci_lower <- quantile(boot_estimates, 0.025)
ci_upper <- quantile(boot_estimates, 0.975)
cat("95% Bootstrap CI for G-estimate: [", ci_lower, ", ", ci_upper, "]\n")