vankesteren · October 31, 2024 12:58 · vankesteren · Nov 1, 2024
diff --git a/causal_graph_sim.R b/causal_graph_sim.R
 # Packages for this script
 library(tidyverse)
 library(lavaan)
 library(dagitty)
 library(glue)
 library(rstanarm)


 ## SIMULATION ##
 # here is a true data-generating process in lavaan (SEM) syntax
 dgp <- "
  A ~ 0.4*D
  B ~ 0.2*D + 0.8*A + -0.1*C
  C ~ -0.3*D + -0.3*A + 0.2*E
  D ~ 1
  E ~ 1
  F ~ 0.9*B + -0.4*C + -0.3*E
 "

 df <- simulateData(dgp, sample.nobs = 20000)

 dag <- lavaanToGraph(dgp)
 plot(dag, show.coefficients = TRUE)


 ## ESTIMATION ##
 # how can we find the adjustment set? Use DAG rules!
 adjustmentSets(dag, exposure = "A", outcome = "B", effect = "direct")
 coef(lm(B ~ A + C + D, data = df))[2]

 # a function to compute any of these paths with OLS
 ols_estimate_path <- function(df, dag, source = "A", target = "B") {
  adjustment_set <- adjustmentSets(dag, exposure = source, outcome = target, effect = "direct")[[1]]
  adj_set_string <- paste(adjustment_set, collapse = ', ')
  print(glue("Identifying {source} -> {target} using adjustment set {adj_set_string}"))
  adj_set_frm <- paste(adjustment_set, collapse = ' + ')
  frm <- glue("{target} ~ {source} + {adj_set_frm}")
  unname(coef(lm(frm, df))[2])
 }

 ols_estimate_path(df, dag, "E", "F")


 ## PREDICTION ##
 # I will re-estimate relevant models in a Bayesian way to easily do posterior prediction / forward simulation
 mod_c <- stan_lm(formula = C ~ A + D,     data = df, prior = NULL, algorithm = "fullrank")
 mod_b <- stan_lm(formula = B ~ A + C + D, data = df, prior = NULL, algorithm = "fullrank")
 mod_f <- stan_lm(formula = F ~ C + B + E, data = df, prior = NULL, algorithm = "fullrank")

 # now, let's simulate an intervention on A and find its effect on F
 # Adding 1.3 to A
 df_int <- df
 df_int$A <- df_int$A + 1.3

 # what would C be?
 df_int$C <- posterior_predict(mod_c, df_int, draws = 1)[1,]

 tibble(x = c(df$C, df_int$C), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of C")


 # what would B be?
 df_int$B <- posterior_predict(mod_b, df_int, draws = 1)[1,]

 tibble(x = c(df$B, df_int$B), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of B")

 # finally, what would F be?
 df_int$F <- posterior_predict(mod_f, df_int, draws = 1)[1,]

 tibble(x = c(df$F, df_int$F), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of F")

 # the average total causal effect
 mean(df_int$F - df$F)
	# Packages for this script
	library(tidyverse)
	library(lavaan)
	library(dagitty)
	library(glue)
	library(rstanarm)


	## SIMULATION ##
	# here is a true data-generating process in lavaan (SEM) syntax
	dgp <- "
	A ~ 0.4*D
	B ~ 0.2D + 0.8A + -0.1*C
	C ~ -0.3D + -0.3A + 0.2*E
	D ~ 1
	E ~ 1
	F ~ 0.9B + -0.4C + -0.3*E
	"

	df <- simulateData(dgp, sample.nobs = 20000)

	dag <- lavaanToGraph(dgp)
	plot(dag, show.coefficients = TRUE)


	## ESTIMATION ##
	# how can we find the adjustment set? Use DAG rules!
	adjustmentSets(dag, exposure = "A", outcome = "B", effect = "direct")
	coef(lm(B ~ A + C + D, data = df))[2]

	# a function to compute any of these paths with OLS
	ols_estimate_path <- function(df, dag, source = "A", target = "B") {
	adjustment_set <- adjustmentSets(dag, exposure = source, outcome = target, effect = "direct")[[1]]
	adj_set_string <- paste(adjustment_set, collapse = ', ')
	print(glue("Identifying {source} -> {target} using adjustment set {adj_set_string}"))
	adj_set_frm <- paste(adjustment_set, collapse = ' + ')
	frm <- glue("{target} ~ {source} + {adj_set_frm}")
	unname(coef(lm(frm, df))[2])
	}

	ols_estimate_path(df, dag, "E", "F")


	## PREDICTION ##
	# I will re-estimate relevant models in a Bayesian way to easily do posterior prediction / forward simulation
	mod_c <- stan_lm(formula = C ~ A + D, data = df, prior = NULL, algorithm = "fullrank")
	mod_b <- stan_lm(formula = B ~ A + C + D, data = df, prior = NULL, algorithm = "fullrank")
	mod_f <- stan_lm(formula = F ~ C + B + E, data = df, prior = NULL, algorithm = "fullrank")

	# now, let's simulate an intervention on A and find its effect on F
	# Adding 1.3 to A
	df_int <- df
	df_int$A <- df_int$A + 1.3

	# what would C be?
	df_int$C <- posterior_predict(mod_c, df_int, draws = 1)[1,]

	tibble(x = c(df$C, df_int$C), type = rep(c("original", "intervention"), each = nrow(df))) \|>
	ggplot(aes(x = x, fill = type)) +
	geom_density() +
	labs(title = "Distribution of C")


	# what would B be?
	df_int$B <- posterior_predict(mod_b, df_int, draws = 1)[1,]

	tibble(x = c(df$B, df_int$B), type = rep(c("original", "intervention"), each = nrow(df))) \|>
	ggplot(aes(x = x, fill = type)) +
	geom_density() +
	labs(title = "Distribution of B")

	# finally, what would F be?
	df_int$F <- posterior_predict(mod_f, df_int, draws = 1)[1,]

	tibble(x = c(df$F, df_int$F), type = rep(c("original", "intervention"), each = nrow(df))) \|>
	ggplot(aes(x = x, fill = type)) +
	geom_density() +
	labs(title = "Distribution of F")

	# the average total causal effect
	mean(df_int$F - df$F)