apoorvalal · June 3, 2024 05:03
diff --git a/linRegressionInference.R b/linRegressionInference.R
 library(momentfit); library(car); library(tictoc)
 set.seed(42)
 # %%
 dgp = \(n=500, k = 2){
  X = matrix(rnorm(n * 2), n, 2)
  Y1 = X[, 1] + X[, 1]^2 + runif(n, -0.5, 0.5)
  Y0 = X[, 2] + X[, 2]^2 + runif(n, -1, 1)
  Z = rbinom(n, 1, 0.6)
  Y = Z * Y1 + (1-Z) * Y0
  data.frame(Y, Z, X)
 }
 df = dgp()
 # %%
 linestimator = function(Z, Y, X) {
  X = scale(X); n = dim(X)[1]; p = dim(X)[2]
  linreg = lm(Y ~ Z * X)
  est = coef(linreg)[2]; vehw = hccm(linreg)[2, 2]
  ## super population correction
  inter = coef(linreg)[(p + 3):(2 * p + 2)]
  vsuper = vehw + sum(inter * (cov(X) %*% inter)) / n
  c(est, sqrt(vehw), sqrt(vsuper)) %>% setNames(c("OLS Est", "EHW SE", "SuperPop SE"))
 }

 # %% estimate problem jointly via gmm
 moment_cond = function(theta, df) {
  p = dim(df)[2]-2
  # unpack params: μ_x , β
  μx = theta[1:p]; β = theta[(p+1):length(theta)]
  # unpack data
  y = as.numeric(df[, 1]); z = as.numeric(df[, 2]); x = as.matrix(df[, 3:ncol(df)])
  xcent = sweep(x, 2, μx) # X - μx
  xtilde = cbind(z, 1, x, xcent)
  m = xtilde * as.vector(y -(xtilde %*% β)) # ols moment condition x(y - xβ)
  cbind(xcent, m) # stack moment conditions -  [μx, β]
 }
 tic()
 linestimator(df$Z, df$Y, df[,3:ncol(df)])
 toc()
 # gmm
 tic()
 m = gel4(g = moment_cond, x = df, theta0 = runif(8), coefSlv = "optim", gelType = "ET")
 toc()
 summary(m)@coef[3,1:2] %>% setNames(c("GMM Est", "GMM SE")) # 3rd coef is treatment effect 
 # OLS Est-0.328529825912218EHW SE0.168275957745803SuperPop SE0.177575093923765
 # GMM Est-0.327120815478441GMM SE0.164029681609099
 # %% repeat
 tic()
 res = LalRUtils::mcReplicate(5000, {
  df = dgp()
  linres = linestimator(df$Z, df$Y, df[,3:ncol(df)])
  m = gel4(g = moment_cond, x = df, theta0 = runif(8), coefSlv = "optim", gelType = "ET")
  gmmres = summary(m)@coef[3,1:2] # coef and SE from GMM estimation
  c(linres, gmmres)
 }, mc.cores = 12)
 toc()
 # %% ## bias
 mean(res[1, ]) # ols
 # 0.00750274551064356
 mean(res[4, ]) # gmm
 # 0.00436392500046487 # lower bias
 # %% coverage based on EHW standard error
 mean((res[1, ] - 1.96 * res[2, ]) * (res[1, ] + 1.96 * res[2, ]) <= 0)
 # 0.9268 - under-coverage
 # %% coverage based on population standard error
 mean((res[1, ] - 1.96 * res[3, ]) * (res[1, ] + 1.96 * res[3, ]) <= 0)
 # 0.9548 - correct coverage but conservative
 # %% coverage based on gmm standard error
 mean((res[4, ] - 1.96 * res[5, ]) * (res[4, ] + 1.96 * res[5, ]) <= 0)
 # 0.952 - correct coverage, less conservative
 # %%
	library(momentfit); library(car); library(tictoc)
	set.seed(42)
	# %%
	dgp = \(n=500, k = 2){
	X = matrix(rnorm(n * 2), n, 2)
	Y1 = X[, 1] + X[, 1]^2 + runif(n, -0.5, 0.5)
	Y0 = X[, 2] + X[, 2]^2 + runif(n, -1, 1)
	Z = rbinom(n, 1, 0.6)
	Y = Z * Y1 + (1-Z) * Y0
	data.frame(Y, Z, X)
	}
	df = dgp()
	# %%
	linestimator = function(Z, Y, X) {
	X = scale(X); n = dim(X)[1]; p = dim(X)[2]
	linreg = lm(Y ~ Z * X)
	est = coef(linreg)[2]; vehw = hccm(linreg)[2, 2]
	## super population correction
	inter = coef(linreg)[(p + 3):(2 * p + 2)]
	vsuper = vehw + sum(inter * (cov(X) %*% inter)) / n
	c(est, sqrt(vehw), sqrt(vsuper)) %>% setNames(c("OLS Est", "EHW SE", "SuperPop SE"))
	}

	# %% estimate problem jointly via gmm
	moment_cond = function(theta, df) {
	p = dim(df)[2]-2
	# unpack params: μ_x , β
	μx = theta[1:p]; β = theta[(p+1):length(theta)]
	# unpack data
	y = as.numeric(df[, 1]); z = as.numeric(df[, 2]); x = as.matrix(df[, 3:ncol(df)])
	xcent = sweep(x, 2, μx) # X - μx
	xtilde = cbind(z, 1, x, xcent)
	m = xtilde * as.vector(y -(xtilde %*% β)) # ols moment condition x(y - xβ)
	cbind(xcent, m) # stack moment conditions - [μx, β]
	}
	tic()
	linestimator(df$Z, df$Y, df[,3:ncol(df)])
	toc()
	# gmm
	tic()
	m = gel4(g = moment_cond, x = df, theta0 = runif(8), coefSlv = "optim", gelType = "ET")
	toc()
	summary(m)@coef[3,1:2] %>% setNames(c("GMM Est", "GMM SE")) # 3rd coef is treatment effect
	# OLS Est-0.328529825912218EHW SE0.168275957745803SuperPop SE0.177575093923765
	# GMM Est-0.327120815478441GMM SE0.164029681609099
	# %% repeat
	tic()
	res = LalRUtils::mcReplicate(5000, {
	df = dgp()
	linres = linestimator(df$Z, df$Y, df[,3:ncol(df)])
	m = gel4(g = moment_cond, x = df, theta0 = runif(8), coefSlv = "optim", gelType = "ET")
	gmmres = summary(m)@coef[3,1:2] # coef and SE from GMM estimation
	c(linres, gmmres)
	}, mc.cores = 12)
	toc()
	# %% ## bias
	mean(res[1, ]) # ols
	# 0.00750274551064356
	mean(res[4, ]) # gmm
	# 0.00436392500046487 # lower bias
	# %% coverage based on EHW standard error
	mean((res[1, ] - 1.96 * res[2, ]) * (res[1, ] + 1.96 * res[2, ]) <= 0)
	# 0.9268 - under-coverage
	# %% coverage based on population standard error
	mean((res[1, ] - 1.96 * res[3, ]) * (res[1, ] + 1.96 * res[3, ]) <= 0)
	# 0.9548 - correct coverage but conservative
	# %% coverage based on gmm standard error
	mean((res[4, ] - 1.96 * res[5, ]) * (res[4, ] + 1.96 * res[5, ]) <= 0)
	# 0.952 - correct coverage, less conservative
	# %%