Skip to content

Instantly share code, notes, and snippets.

@battenr
Created July 25, 2024 19:22
Show Gist options
  • Select an option

  • Save battenr/7f01546afbbe878b8dc87e19e714f58f to your computer and use it in GitHub Desktop.

Select an option

Save battenr/7f01546afbbe878b8dc87e19e714f58f to your computer and use it in GitHub Desktop.
Parametric G-Formula versus IPTW
# Title: Parametric G-Formula vs IPTW
# Description: Comparing results from parametric g-formula to results from IPTW.
# These methods model different things. The parametric g-formula models the outcome
# while IPTW models the treatment. The goal of this code is to show how they differ,
# but are not drastically different (in this example with minimal model misspecification).
# Setup ----
library(tidyverse) # ol faithful
library(WeightIt) # for IPTW
library(boot) # for estimating SE using bootstrapping for standardization
library(tidymodels) # bootstrapping for IPTW
library(broom) # tidying the model results
# Simulating Data ----
set.seed(456) # setting seed for reproducibility
n = 500 # sample size. Arbitrarily chosen as 500. Note the sample size will
# impact the standard error.
# Simulating data
# - Two confounders: one binary, one continuous
# - Binary treatment
# - Continuous outcome
df <- data.frame(
c1 = rbinom(n = n, size = 1, prob = 0.45), # arbitrary probability of 0.45
c2 = rnorm(n = n, mean = 5, sd = 2) # continuous confounder
) %>%
dplyr::mutate(
x = rbinom(n = n, size = 1, prob = plogis(c1 + 0.1*c2)), # binary treatment
y = 3 + 0.5*c1 + 0.75*c2 + 1.5*x + rnorm(n = n) # continuous outcome
)
# Inverse Probability of Treatment Weighting (IPTW) ----
# Fitting the propensity score model
ps_mod <- WeightIt::weightit(x ~ c1 + c2, # including both confounders.
data = df,
method = "glm",
estimand = "ATE") # using the same estimand as the
# parametric g-formula (ATE)
# Outcome model
# This outcome model uses the weights from the PS model.
out_mod <- glm(y ~ x,
family = gaussian(link = "identity"), # continuous outcome
data = df,
weights = ps_mod$weights)
#... Bootstrapping ----
#..... Setup for Bootstrapping ----
# Function for calculating the propensity score model and then the outcome model.
# Personally, I fit the PS model again in each replicate (similar to replicate
# weights in surveys) but open to discussion on why/why not.
ps_glm <- function(split) {
df <- rsample::analysis(split) # need to do based on how the bootstraps are
# formatted from the bootstraps function
# Fitting PS model
ps_model <- WeightIt::weightit(x ~ c1 + c2,
data = df,
method = "ps")
# Fitting outcome model and tidying.
stats::glm(y ~ x,
data = df,
weights = ps_model$weights) %>%
tidy()
}
#..... Actual Bootstrapping ----
boot_results <- rsample::bootstraps(df, 1000, apparent = TRUE) %>% # 1000 bootstraps (arbitrary for this example)
# if using bootstrapping suggest reviewing references on how to choose the number of replicates required.
# Applying our function above to the data.
mutate(results = map(splits,
ps_glm))
# Calculate the CIs (using 95% CIs)
ipw_cis <- int_pctl(boot_results, results)[2,] %>% # calculating the CIs based on percentiles.
# note: this inherently has some assumptions.
# Reformatting and adding the type of CI (for later)
rename(lower = .lower,
estimate = .estimate,
upper = .upper) %>%
select(lower,
#estimate, - don't want to select the estimate in this case
upper) %>%
mutate(
type = "Bootstrapping",
estimate = tidy(out_mod)[2, 2] %>% as.numeric() # taking estimate from the GLM model.
)
# Parametric G-Formula ----
# The basic idea of the parametric g-formula is to fit a model with the outcome.
# This model is then used to predict values for the outcomes when the treatment is 0
# for all individuals and 1 for all individuals.
# Fitting model
mod <- glm(y ~ x + c1 + c2,
data = df)
# Predicting outcome when treatment status is 0
df_x0 <- df %>%
mutate(x = 0,
ypred = predict(mod) # predicting the outcome based on x being 0
)
# Predicting outcome when treatment is 1
df_x1 <- df %>%
mutate(x = 1,
ypred = predict(mod))
# To calculate the estimate, we'd take the difference when treatment is 1 and treatment is 0.
# We could then take the average of these values.
#... Bootstrapping ----
#..... Setup for Bootstrapping ----
# To estimate the CIs we need to use bootstrapping. Credit to Joy Shi,
# Sean McGrath and Tom Palmer for making this R code freely available
# from "Causal Inference: What If" by Hernan & Robins
standardization <- function(data, indices) {
# We need to first make the datasets that we need
# 1st copy: our original
d <- data[indices, ]
# 2nd copy: Same as original but with group = 0 and outcome as missing
d0 <- d
d0$x <- 0 # setting treatment to 0
d0$y <- NA # setting our outcome to missing
# 3rd copy: Same as original but with group = 1 and outcome as missing
d1 <- d
d1$x <- 1 # setting treatment to 1
d1$y <- NA # setting our outcome to missing
# Making one sample
d.onesample <- rbind(d, d0, d1) # combining datasets
# Fitting a model for each iteration
fit <- glm(
y ~ x + c1 + c2,
data = d.onesample
)
# Using model to predict the outcome
d.onesample$predicted_meanY <- predict(fit, d.onesample)
# Calculate the mean for each of the groups. The third calculation is for the difference in group O (0) vs group A (1)
return(c(
mean(d.onesample$predicted_meanY[d.onesample$x == 0]),
mean(d.onesample$predicted_meanY[d.onesample$x == 1]),
# Treatment - No Treatment
mean(d.onesample$predicted_meanY[d.onesample$x == 1]) -
mean(d.onesample$predicted_meanY[d.onesample$x == 0])
))
}
#..... Actual Bootstrapping ----
# Now we can bootstrap this statistic using our dataset
results <- boot(data = df,
statistic = standardization,
R = 5)
# Using the bootstrapped sample (titled result), we can calculate the confidence intervals.
# We take the standard deviation of the sampling distribution.
# This will give us our standard error.
se <- c(sd(results$t[, 1]),
sd(results$t[, 2]),
sd(results$t[,3]))
mean <- results$t0 # calculate mean
ll <- mean - qnorm(0.975) * se # lower limits
ul <- mean + qnorm(0.975) * se # upper limits
g_formula_results <- data.frame(
result_title = c("Group 0", "Group 1", "Difference"),
mean = round(mean,3),
ci = paste0("95% CI: ", round(ll, 3), " - ", round(ul, 3))
)
# Plotting! ----
#... Getting Necessary Data ----
# Data for Parametric G-Formula
lower_g <- ll[3]
est_g <- mean[3]
upper_g <- ul[3]
# Data for IPTW
lower_ipw <- ipw_cis$lower
est_ipw <- ipw_cis$estimate
upper_ipw <- ipw_cis$upper
# Combining into one dataframe.
df_ci <- data.frame(
method = c("Parametric G-Formula", "IPTW"),
lower = c(lower_g, lower_ipw),
estimate = c(est_g, est_ipw),
upper = c(upper_g, upper_ipw)
)
#... Plot Time! ----
# Creating a plot like a mini forest plot to show the different CIs for the different
# methods.
ggplot(df_ci,
aes(x = method, y = estimate)) +
# Adding the estimates as points.
geom_point(
size = 4,
shape = 21,
color = "hotpink",
fill = "hotpink") +
# Adding standard error bars. The width is set to 0 since I don't want it to
# look like a star wars tie fighter.
geom_errorbar(
aes(ymin = lower, ymax = upper),
linewidth = 2.5,
width = 0,
color = "hotpink") +
coord_flip() + # Flip coordinates for horizontal plot
# Minimal theme
theme_minimal() +
# Adding labels for axes.
labs(
title = "Estimating Treatment Effect",
subtitle = "Parametric G-Formula and IPTW with Bootstrapping (N = 500)",
y = "Estimate (with 95% CI)",
x = "Method") +
# Adding limits. However here this is really based on this particular example.
# If you re-run you may need to adjust accordingly.
lims(y = c(1, 2)) +
#
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
text = element_text(size = 20),
axis.text = element_text(color = "black"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment