Skip to content

Instantly share code, notes, and snippets.

@mingjiphd
Last active January 27, 2026 18:42
Show Gist options
  • Select an option

  • Save mingjiphd/21d08af7c541a1779defd223fddedf40 to your computer and use it in GitHub Desktop.

Select an option

Save mingjiphd/21d08af7c541a1779defd223fddedf40 to your computer and use it in GitHub Desktop.
Causal Analysis using R with More Sophisticated DAGs
This R script is a step by step tutorial on how to do causal analysis using R with more sophisticated DAGs. Topics covered include: How to generate simulated data for DAG; How to specify a DAG model; How to plot a DAG model; Minimal Adjustment Sets; D-Separation; Conditional Independencies; and Instrumental Variables.
A step by step video demonstration can be found at: https://youtu.be/oN9qJmVKKmE
### Causal Analysis using R: Sophisticated DAGs
###Generate Data
# Install if needed
#if (!requireNamespace("simDAG", quietly = TRUE)) {
# install.packages("simDAG")
#}
library(simDAG)
# Define DAG with root nodes and child nodes with parents and regression formulas
dag <- empty_dag() +
# Root nodes with distributions
node("age", type = "rnorm", mean = 50, sd = 10) +
node("sex", type = "rbernoulli", p = 0.5) +
# Child continuous node "bmi" depends on age and sex
node(
"bmi",
type = "gaussian",
parents = c("age", "sex"),
formula = ~ -2 + age * 0.3 + sexTRUE * (-0.2),
error = 5
) +
# Child binary node "smoker" depends on age and sex
node(
"smoker",
type = "binomial",
parents = c("age", "sex"),
formula = ~ -4 + age * 0.05 + sexTRUE * 0.8
) +
# Child continuous node "blood_pressure" depends on bmi and smoker
node(
"blood_pressure",
type = "gaussian",
parents = c("bmi", "smoker"),
formula = ~ 120 + bmi * 1.2 + smoker * 3.0,
error = 8
) +
# Outcome binary node "heart_disease" depends on blood_pressure and age
node(
"heart_disease",
type = "binomial",
parents = c("blood_pressure", "age"),
formula = ~ -6 + blood_pressure * 0.08 + age * 0.01
)
# Simulate 1000 samples from the DAG
set.seed(123)
sim_data <- sim_from_dag(dag, n_sim = 1000)
# View first few rows
head(sim_data)
##Visualization
# Install ggdag if needed
#if (!requireNamespace("ggdag", quietly = TRUE)) {
# install.packages("ggdag")
#}
library(ggdag)
library(ggplot2)
# Define the DAG with relationships as in the simulation
dag_object <- dagify(
bmi ~ age + sex,
smoker ~ age + sex,
blood_pressure ~ bmi + smoker,
heart_disease ~ blood_pressure + age,
exposure = "smoker",
outcome = "heart_disease"
)
# Plot the DAG with ggdag
library(ggplot2)
ggdag(dag_object) +
theme_dag() +
ggtitle("DAG Visualization for Simulated Data")
### Minimal Adjustment Sets
library(dagitty)
library(ggdag)
# Define DAG in dagitty
dag_code <- '
dag {
age -> bmi
sex -> bmi
age -> smoker
sex -> smoker
bmi -> blood_pressure
smoker -> blood_pressure
blood_pressure -> heart_disease
age -> heart_disease
}
'
dag_obj <- dagitty(dag_code)
exposures(dag_obj) <- "smoker"
outcomes(dag_obj) <- "heart_disease"
# Get minimal adjustment sets
adj_sets <- adjustmentSets(dag_obj)
print(adj_sets) # e.g., list of character vectors
# Take first minimal adjustment set as example
first_set <- adj_sets[[1]]
# Visualize adjustment set with ggdag, note use of 'adjusted' parameter
ggdag_adjustment_set(dag_obj, exposure = "smoker", outcome = "heart_disease", shadow = TRUE) +
theme_dag() +
ggplot2::ggtitle("Minimal Adjustment Sets Visualization")
###Further Analysis suing the Minimal Adjustment Set
# Assume you have sim_data from the previous example and minimal adjustment set identified, e.g.:
minimal_adj_set <- c("age", "bmi") # Example minimal adjustment set from previous step
# Fit logistic regression for binary outcome 'heart_disease' on exposure 'smoker'
# adjusting for the minimal adjustment set
model <- glm(heart_disease ~ smoker + age + bmi,
data = sim_data,
family = binomial(link = "logit"))
# Summary of the model to see coefficients and significance
summary(model)
# Interpret the coefficient for 'smoker' as the estimated causal effect adjusted for confounding
### More Advanced Options
# Install packages if needed
#if (!requireNamespace("dagitty", quietly = TRUE)) install.packages("dagitty")
#if (!requireNamespace("ggdag", quietly = TRUE)) install.packages("ggdag")
library(dagitty)
library(ggdag)
# Define DAG in dagitty syntax with coordinates for better plotting
# Create the DAG object (your original specification)
dag <- dagitty('dag {
age [pos="0.100,0.500"]
blood_pressure [pos="0.700,0.400"]
bmi [pos="0.400,0.500"]
heart_disease [outcome,pos="0.900,0.300"]
sex [pos="0.100,0.200"]
smoker [exposure,pos="0.400,0.200"]
age -> bmi
age -> heart_disease
age -> smoker
blood_pressure -> heart_disease
bmi -> blood_pressure
sex -> bmi
sex -> smoker
smoker -> blood_pressure
}')
# Explicitly set the coordinates (names and values as in your file)
coordinates(dag) <- list(
x = c(age = 0.1, blood_pressure = 0.7, bmi = 0.4, heart_disease = 0.9,
sex = 0.1, smoker = 0.4, stress = 0.2),
y = c(age = 0.5, blood_pressure = 0.4, bmi = 0.5, heart_disease = 0.3,
sex = 0.2, smoker = 0.2, stress = 0.6)
)
# Plot with your coordinates
plot(dag)
# Use ggdag to plot with ggplot2 style
ggdag(dag) +
theme_dag() +
ggtitle("Causal DAG with Coordinates and Layout")
# Check d-separation: Is 'bmi' a confounder that blocks backdoor path?
#D-separation is a graphical criterion that tells you whether two variables
#are conditionally independent given a set of other variables, based on the
#structure of a causal DAG.
dseparated(dag, "smoker", "heart_disease", c("bmi"))
# List all testable conditional independencies implied by the DAG
#The output of impliedConditionalIndependencies(dag) lists the conditional
#independence statements that your DAG implies must hold true in any data generated
#from the causal structure.
impliedConditionalIndependencies(dag)
# Identify all minimal sufficient adjustment sets
adj_sets <- adjustmentSets(dag, exposure = "smoker", outcome = "heart_disease")
print(adj_sets)
# Find instrumental variables (IVs) for 'smoker' -> 'heart_disease' effect if any
# An IV must be associated with the exposure ("smoker") but affects the outcome ("heart_disease")
# only through this exposure and not via confounding paths.
ivs <- instrumentalVariables(dag_obj, exposure = "smoker", outcome = "heart_disease")
print(ivs)
# Visualize one minimal adjustment set (the first one)
# Visualize all minimal adjustment sets (no manual passing of sets)
ggdag_adjustment_set(
dag_obj,
exposure = "smoker",
outcome = "heart_disease",
shadow = TRUE
) +
theme_dag() +
ggtitle("Minimal Adjustment Sets Visualization")
# Set manual coordinates on the dag object
coordinates(dag_obj) <- list(
x = c(age = 0.1, sex = 0.1, bmi = 0.4, smoker = 0.4, blood_pressure = 0.7, heart_disease = 0.9),
y = c(age = 0.5, sex = 0.2, bmi = 0.5, smoker = 0.2, blood_pressure = 0.4, heart_disease = 0.3)
)
# Plot adjustment sets using the same dag_obj
p <- ggdag_adjustment_set(
dag_obj,
exposure = "smoker",
outcome = "heart_disease",
shadow = TRUE
) + theme_dag() + ggtitle("Minimal Adjustment Sets Visualization")
print(p)
# Export DAG to text file for use outside R
writeLines(as.character(dag), "dagitty_export.dag")
A step by step video demonstration can be found at: https://youtu.be/oN9qJmVKKmE
@mingjiphd
Copy link
Author

A step by step video demonstration can be found at: https://youtu.be/oN9qJmVKKmE

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment