Created
April 23, 2026 17:44
-
-
Save battenr/77fa08c05a4af61c5faa04d2b705abe5 to your computer and use it in GitHub Desktop.
Posterior Predictive Checks for Causal Inference
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Title: Posterior Predictive Checks for Bayesian & Frequentist Models | |
| # Description: When fitting a model, we need to check assumptions. In a Bayesian framework, | |
| # one excellent tool is the posterior predictive check. A similar thing can be done with | |
| # frequentist models by simulating data with the model then comparing to the observed data | |
| # This code demonstrates that and how it's useful | |
| # Note: This code was created with the help of Claude Sonnet 4.5 and reviewed by me (R Batten) | |
| # Setup ---- | |
| #... Libraries ---- | |
| library(tidyverse) # ol' faithful | |
| library(splines) # for non-linear term | |
| library(performance) # for the posterior predictive check | |
| library(patchwork) # for combining plots | |
| #... Functions ---- | |
| # Description: Simulate data with a non-linear (quadratic) relationship between z1 and y | |
| sim_data <- function(n = 250, | |
| beta_trt = 1.5, | |
| z1_mean = 5, z1_sd = 2, | |
| z2_size = 1, z2_prob = 0.5, | |
| z1_on_x = 0.05, z2_on_x = 0.2, | |
| z1_on_y = 0.5, z2_on_y = 0.3) { | |
| data.frame( | |
| z1 = rnorm(n = n, mean = z1_mean, sd = z1_sd), | |
| z2 = rbinom(n = n, size = z2_size, prob = z2_prob) | |
| ) %>% | |
| dplyr::mutate( | |
| prob = plogis(z1_on_x * z1 + z2_on_x * z2), | |
| x = rbinom(n = n, size = 1, prob = prob), | |
| # True relationship: quadratic in z1 | |
| y = beta_trt * x + z1_on_y * z1^2 + z2_on_y * z2 + rnorm(n, 0, 1) | |
| ) | |
| } | |
| # Simulate Data ---- | |
| set.seed(456) # Setting seed for reproducibility | |
| df <- sim_data() # simulating data | |
| # Fitting Models ---- | |
| #... Model 1: Linear Terms Only ---- | |
| # Intentionally using the wrong terms here (since z1 isn't linear with y) | |
| mod1 <- glm(y ~ x + z1 + z2, | |
| family = gaussian(link = "identity"), | |
| data = df) | |
| # mod1 %>% broom::tidy(conf.int = TRUE) # optional: check results | |
| #... Model 2: Natural spline for z1 ---- | |
| # This model is "correct" in the sense it's accounting for non-linearity between z1 | |
| # and y | |
| mod2 <- glm(y ~ x + splines::ns(z1, 4) + z2, | |
| family = gaussian(link = "identity"), | |
| data = df) | |
| # mod2 %>% broom::tidy(conf.int = TRUE) # optional: check results | |
| # Check Both Models ---- | |
| # Here using the check_predictions() to simulate data based on the model | |
| p1 <- performance::check_predictions(mod1, iterations = 500) | |
| p2 <-performance::check_predictions(mod2, iterations = 500) | |
| # Posterior Predictive Check Plots ---- | |
| #... Model 1 ---- | |
| p1_plot <- plot(p1) + | |
| labs( | |
| title = "Model 1: Linear", | |
| subtitle = "y ~ x + z1 + z2 | Misspecified: ignores quadratic z1 effect" | |
| ) + | |
| theme_minimal() + | |
| theme( | |
| plot.title = element_text(hjust = 0.5, size = 20, face = "bold"), | |
| plot.subtitle = element_text(hjust = 0.5, size = 20), | |
| text = element_text(size = 20) | |
| ) + | |
| lims(x = c(-20, 60)) | |
| #... Model 2 ---- | |
| p2_plot <- plot(p2, linewidth = 0.5) + | |
| labs( | |
| title = "Model 2: Natural Spline", | |
| subtitle = "y ~ x + ns(z1, 4) + z2 | Flexible: captures non-linearity" | |
| ) + theme_minimal() + | |
| theme( | |
| plot.title = element_text(hjust = 0.5, size = 20, face = "bold"), | |
| plot.subtitle = element_text(hjust = 0.5, size = 20), | |
| text = element_text(size = 20) | |
| ) + | |
| lims(x = c(-20, 60)) | |
| #... Combined ---- | |
| (p1_plot / p2_plot) + | |
| plot_annotation( | |
| title = "Posterior Predictive Checks", | |
| subtitle = paste0( | |
| "Green line = observed data. Blue lines = 500 simulated datasets from the fitted model.\n", | |
| "Good fit: simulated lines should closely envelope the observed distribution." | |
| ), | |
| theme = theme( | |
| plot.title = element_text(face = "bold", size = 22, hjust = 0.5), | |
| plot.subtitle = element_text(color = "grey35", size = 20, lineheight = 1.4, hjust = 0.5), | |
| plot.background = element_rect(fill = "white", color = NA), | |
| text = element_text(size = 20) | |
| ) | |
| ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment