paulgp · November 22, 2024 01:00
diff --git a/robust_simulations.R b/robust_simulations.R
 # Set random seed for reproducibility
 set.seed(123)

 # Simulation parameters
 n_clusters <- 100 # Number of clusters
 cluster_size <- 100 # Units per cluster
 n <- n_clusters * cluster_size # Total sample size
 rho <- 0.5 # Within-cluster error correlation
 iterations <- 1000 # Number of simulation iterations

 # Function to generate clustered data with heterogeneous treatment effects
 generate_data <- function() {
    # Generate cluster IDs
    cluster_id <- rep(1:n_clusters, each = cluster_size)

    # Generate correlated errors within clusters
    cluster_effect <- rep(rnorm(n_clusters, sd = sqrt(rho)), each = cluster_size)
    individual_effect <- rnorm(n, sd = sqrt(1 - rho))
    error <- cluster_effect + individual_effect

    # Random treatment assignment at individual level
    treatment <- rbinom(n, 1, 0.5)

    # Generate heterogeneous treatment effects
    # Treatment effect varies by both cluster and individual
    cluster_te <- rep(rnorm(n_clusters, mean = .1, sd = 0.5), each = cluster_size)
    individual_te <- rnorm(n, mean = .1, sd = 0.5)
    treatment_effect <- cluster_te + individual_te

    # Outcome with heterogeneous treatment effects
    y <- treatment * treatment_effect + error

    data.frame(
        y = y,
        treatment = treatment,
        cluster = cluster_id,
        true_effect = treatment_effect
    )
 }

 # Function to estimate model and return different standard errors
 estimate_model <- function(data) {
    # Fit the model
    model <- lm(y ~ treatment, data = data)

    # Get robust (HC2) standard errors
    robust_se <- sqrt(diag(sandwich::vcovHC(model, type = "HC2")))[2]

    # Get clustered standard errors
    clustered_se <- sqrt(diag(sandwich::vcovCL(model, cluster = data$cluster)))[2]

    # Return coefficient and both SEs
    c(
        coef = coef(model)[2],
        robust_se = robust_se,
        clustered_se = clustered_se
    )
 }

 # Run simulation
 results <- replicate(iterations, {
    data <- generate_data()
    # Calculate true average treatment effect for this sample
    true_ate <- mean(data$true_effect)
    c(estimate_model(data), true_ate = true_ate)
 })

 # Calculate coverage rates
 alpha <- 0.05

 coverage_robust <- mean(abs((results[1, ] - results[4, ]) / results[2, ]) < qnorm(1 - alpha / 2))
 coverage_clustered <- mean(abs((results[1, ] - results[4, ]) / results[3, ]) < qnorm(1 - alpha / 2))

 # Print results
 cat("\nResults from", iterations, "iterations:\n")
 cat("\nAverage estimated treatment effect:", mean(results[1, ]))
 cat("\nAverage true treatment effect:", mean(results[4, ]))
 cat("\n\nAverage SEs:\n")
 cat("Robust SE:     ", mean(results[2, ]), "\n")
 cat("Clustered SE:  ", mean(results[3, ]), "\n")
 cat("\nCoverage rates for 95% confidence intervals:\n")
 cat("Robust SE:     ", round(coverage_robust * 100, 1), "%\n")
 cat("Clustered SE:  ", round(coverage_clustered * 100, 1), "%\n")

 # Calculate ratio of clustered to robust SEs
 se_ratio <- results[3, ] / results[2, ]
 cat("\nAverage ratio of Clustered to Robust SEs:", mean(se_ratio), "\n")

 # Calculate empirical standard deviation of estimates
 cat("\nEmpirical SD of estimates:", sd(results[1, ]), "\n")
	# Set random seed for reproducibility
	set.seed(123)

	# Simulation parameters
	n_clusters <- 100 # Number of clusters
	cluster_size <- 100 # Units per cluster
	n <- n_clusters * cluster_size # Total sample size
	rho <- 0.5 # Within-cluster error correlation
	iterations <- 1000 # Number of simulation iterations

	# Function to generate clustered data with heterogeneous treatment effects
	generate_data <- function() {
	# Generate cluster IDs
	cluster_id <- rep(1:n_clusters, each = cluster_size)

	# Generate correlated errors within clusters
	cluster_effect <- rep(rnorm(n_clusters, sd = sqrt(rho)), each = cluster_size)
	individual_effect <- rnorm(n, sd = sqrt(1 - rho))
	error <- cluster_effect + individual_effect

	# Random treatment assignment at individual level
	treatment <- rbinom(n, 1, 0.5)

	# Generate heterogeneous treatment effects
	# Treatment effect varies by both cluster and individual
	cluster_te <- rep(rnorm(n_clusters, mean = .1, sd = 0.5), each = cluster_size)
	individual_te <- rnorm(n, mean = .1, sd = 0.5)
	treatment_effect <- cluster_te + individual_te

	# Outcome with heterogeneous treatment effects
	y <- treatment * treatment_effect + error

	data.frame(
	y = y,
	treatment = treatment,
	cluster = cluster_id,
	true_effect = treatment_effect
	)
	}

	# Function to estimate model and return different standard errors
	estimate_model <- function(data) {
	# Fit the model
	model <- lm(y ~ treatment, data = data)

	# Get robust (HC2) standard errors
	robust_se <- sqrt(diag(sandwich::vcovHC(model, type = "HC2")))[2]

	# Get clustered standard errors
	clustered_se <- sqrt(diag(sandwich::vcovCL(model, cluster = data$cluster)))[2]

	# Return coefficient and both SEs
	c(
	coef = coef(model)[2],
	robust_se = robust_se,
	clustered_se = clustered_se
	)
	}

	# Run simulation
	results <- replicate(iterations, {
	data <- generate_data()
	# Calculate true average treatment effect for this sample
	true_ate <- mean(data$true_effect)
	c(estimate_model(data), true_ate = true_ate)
	})

	# Calculate coverage rates
	alpha <- 0.05

	coverage_robust <- mean(abs((results[1, ] - results[4, ]) / results[2, ]) < qnorm(1 - alpha / 2))
	coverage_clustered <- mean(abs((results[1, ] - results[4, ]) / results[3, ]) < qnorm(1 - alpha / 2))

	# Print results
	cat("\nResults from", iterations, "iterations:\n")
	cat("\nAverage estimated treatment effect:", mean(results[1, ]))
	cat("\nAverage true treatment effect:", mean(results[4, ]))
	cat("\n\nAverage SEs:\n")
	cat("Robust SE: ", mean(results[2, ]), "\n")
	cat("Clustered SE: ", mean(results[3, ]), "\n")
	cat("\nCoverage rates for 95% confidence intervals:\n")
	cat("Robust SE: ", round(coverage_robust * 100, 1), "%\n")
	cat("Clustered SE: ", round(coverage_clustered * 100, 1), "%\n")

	# Calculate ratio of clustered to robust SEs
	se_ratio <- results[3, ] / results[2, ]
	cat("\nAverage ratio of Clustered to Robust SEs:", mean(se_ratio), "\n")

	# Calculate empirical standard deviation of estimates
	cat("\nEmpirical SD of estimates:", sd(results[1, ]), "\n")