Created
October 31, 2024 12:58
-
-
Save vankesteren/4e9cf9e4975ccd535e83d409a960db26 to your computer and use it in GitHub Desktop.
Simulating data, estimating effects, and computing intervention effects with uncertainty propagation in a simple causal graph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages for this script | |
library(tidyverse) | |
library(lavaan) | |
library(dagitty) | |
library(glue) | |
library(rstanarm) | |
## SIMULATION ## | |
# here is a true data-generating process in lavaan (SEM) syntax | |
dgp <- " | |
A ~ 0.4*D | |
B ~ 0.2*D + 0.8*A + -0.1*C | |
C ~ -0.3*D + -0.3*A + 0.2*E | |
D ~ 1 | |
E ~ 1 | |
F ~ 0.9*B + -0.4*C + -0.3*E | |
" | |
df <- simulateData(dgp, sample.nobs = 20000) | |
dag <- lavaanToGraph(dgp) | |
plot(dag, show.coefficients = TRUE) | |
## ESTIMATION ## | |
# how can we find the adjustment set? Use DAG rules! | |
adjustmentSets(dag, exposure = "A", outcome = "B", effect = "direct") | |
coef(lm(B ~ A + C + D, data = df))[2] | |
# a function to compute any of these paths with OLS | |
ols_estimate_path <- function(df, dag, source = "A", target = "B") { | |
adjustment_set <- adjustmentSets(dag, exposure = source, outcome = target, effect = "direct")[[1]] | |
adj_set_string <- paste(adjustment_set, collapse = ', ') | |
print(glue("Identifying {source} -> {target} using adjustment set {adj_set_string}")) | |
adj_set_frm <- paste(adjustment_set, collapse = ' + ') | |
frm <- glue("{target} ~ {source} + {adj_set_frm}") | |
unname(coef(lm(frm, df))[2]) | |
} | |
ols_estimate_path(df, dag, "E", "F") | |
## PREDICTION ## | |
# I will re-estimate relevant models in a Bayesian way to easily do posterior prediction / forward simulation | |
mod_c <- stan_lm(formula = C ~ A + D, data = df, prior = NULL, algorithm = "fullrank") | |
mod_b <- stan_lm(formula = B ~ A + C + D, data = df, prior = NULL, algorithm = "fullrank") | |
mod_f <- stan_lm(formula = F ~ C + B + E, data = df, prior = NULL, algorithm = "fullrank") | |
# now, let's simulate an intervention on A and find its effect on F | |
# Adding 1.3 to A | |
df_int <- df | |
df_int$A <- df_int$A + 1.3 | |
# what would C be? | |
df_int$C <- posterior_predict(mod_c, df_int, draws = 1)[1,] | |
tibble(x = c(df$C, df_int$C), type = rep(c("original", "intervention"), each = nrow(df))) |> | |
ggplot(aes(x = x, fill = type)) + | |
geom_density() + | |
labs(title = "Distribution of C") | |
# what would B be? | |
df_int$B <- posterior_predict(mod_b, df_int, draws = 1)[1,] | |
tibble(x = c(df$B, df_int$B), type = rep(c("original", "intervention"), each = nrow(df))) |> | |
ggplot(aes(x = x, fill = type)) + | |
geom_density() + | |
labs(title = "Distribution of B") | |
# finally, what would F be? | |
df_int$F <- posterior_predict(mod_f, df_int, draws = 1)[1,] | |
tibble(x = c(df$F, df_int$F), type = rep(c("original", "intervention"), each = nrow(df))) |> | |
ggplot(aes(x = x, fill = type)) + | |
geom_density() + | |
labs(title = "Distribution of F") | |
# the average total causal effect | |
mean(df_int$F - df$F) |
Author
vankesteren
commented
Nov 1, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment