Skip to content

Instantly share code, notes, and snippets.

@paulgp
Created November 22, 2024 01:00
Show Gist options
  • Save paulgp/6d8a0c452bc8a0da7f4c054314d7e59e to your computer and use it in GitHub Desktop.
Save paulgp/6d8a0c452bc8a0da7f4c054314d7e59e to your computer and use it in GitHub Desktop.
Simulations that follow Abadie et al.'s paper "When Should You Adjust Standard Errors for Clustering?"
# Set random seed for reproducibility
set.seed(123)
# Simulation parameters
n_clusters <- 100 # Number of clusters
cluster_size <- 100 # Units per cluster
n <- n_clusters * cluster_size # Total sample size
rho <- 0.5 # Within-cluster error correlation
iterations <- 1000 # Number of simulation iterations
# Function to generate clustered data with heterogeneous treatment effects
generate_data <- function() {
# Generate cluster IDs
cluster_id <- rep(1:n_clusters, each = cluster_size)
# Generate correlated errors within clusters
cluster_effect <- rep(rnorm(n_clusters, sd = sqrt(rho)), each = cluster_size)
individual_effect <- rnorm(n, sd = sqrt(1 - rho))
error <- cluster_effect + individual_effect
# Random treatment assignment at individual level
treatment <- rbinom(n, 1, 0.5)
# Generate heterogeneous treatment effects
# Treatment effect varies by both cluster and individual
cluster_te <- rep(rnorm(n_clusters, mean = .1, sd = 0.5), each = cluster_size)
individual_te <- rnorm(n, mean = .1, sd = 0.5)
treatment_effect <- cluster_te + individual_te
# Outcome with heterogeneous treatment effects
y <- treatment * treatment_effect + error
data.frame(
y = y,
treatment = treatment,
cluster = cluster_id,
true_effect = treatment_effect
)
}
# Function to estimate model and return different standard errors
estimate_model <- function(data) {
# Fit the model
model <- lm(y ~ treatment, data = data)
# Get robust (HC2) standard errors
robust_se <- sqrt(diag(sandwich::vcovHC(model, type = "HC2")))[2]
# Get clustered standard errors
clustered_se <- sqrt(diag(sandwich::vcovCL(model, cluster = data$cluster)))[2]
# Return coefficient and both SEs
c(
coef = coef(model)[2],
robust_se = robust_se,
clustered_se = clustered_se
)
}
# Run simulation
results <- replicate(iterations, {
data <- generate_data()
# Calculate true average treatment effect for this sample
true_ate <- mean(data$true_effect)
c(estimate_model(data), true_ate = true_ate)
})
# Calculate coverage rates
alpha <- 0.05
coverage_robust <- mean(abs((results[1, ] - results[4, ]) / results[2, ]) < qnorm(1 - alpha / 2))
coverage_clustered <- mean(abs((results[1, ] - results[4, ]) / results[3, ]) < qnorm(1 - alpha / 2))
# Print results
cat("\nResults from", iterations, "iterations:\n")
cat("\nAverage estimated treatment effect:", mean(results[1, ]))
cat("\nAverage true treatment effect:", mean(results[4, ]))
cat("\n\nAverage SEs:\n")
cat("Robust SE: ", mean(results[2, ]), "\n")
cat("Clustered SE: ", mean(results[3, ]), "\n")
cat("\nCoverage rates for 95% confidence intervals:\n")
cat("Robust SE: ", round(coverage_robust * 100, 1), "%\n")
cat("Clustered SE: ", round(coverage_clustered * 100, 1), "%\n")
# Calculate ratio of clustered to robust SEs
se_ratio <- results[3, ] / results[2, ]
cat("\nAverage ratio of Clustered to Robust SEs:", mean(se_ratio), "\n")
# Calculate empirical standard deviation of estimates
cat("\nEmpirical SD of estimates:", sd(results[1, ]), "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment