Last active
April 24, 2023 18:39
-
-
Save joshuajnoble/c3f6e172fc2dcd3204530827b4e7c3fe to your computer and use it in GitHub Desktop.
Causality Reading : Double ML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
transform_data <- function(c, i){ | |
randnum = runif(1) | |
if (round(i/3) == i/3){ | |
return(cos(c + 1)) | |
} | |
else if (round(i/4) == i/4){ | |
return(sin(c + 1)) | |
} | |
else if (round(i/2) == i/2){ | |
return(c^2) | |
} | |
else { | |
return(exp(c) ) | |
} | |
} | |
generate_data <- function (n, n_confounders = 6, effect_size = 2){ | |
c0 = rnorm(n) | |
df = data.frame(c0 = c0) #at least one confounder with a linear effect | |
m1 = c0 + rnorm(n) | |
m2 = c0 + rnorm(n) | |
for (i in 1:(n_confounders - 1)){ | |
c = rnorm(n) | |
m1 = m1 + transform_data(c, i) | |
m2 = m2 + transform_data(c, i + 1) | |
df[,paste0("c", as.character(i))] = c | |
} | |
x = rnorm(n, sd = sd(m1) / 2) + m1 #we want x to be mostly comprised of effects from the confounders | |
x = (x - mean(x)) / sd(x) #rescale to be the same as other covariates | |
y = x * effect_size + m2 + rnorm(n, sd = sd(m2) /10) | |
df$x = x | |
df$y = y | |
return(df) | |
} | |
get_estimates <- function(n = 500){ | |
df = generate_data(n) | |
df_doubleml = DoubleML::double_ml_data_from_data_frame(df, y_col = c("y"), d_cols = c("x")) | |
dml_ranger = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.ranger"), mlr3::lrn("regr.ranger")) | |
dml_ranger$fit() | |
dml_ranger_df = data.frame(dml_ranger$confint()) | |
colnames(dml_ranger_df) <- c("dml_ranger_lower", "dml_ranger_upper") | |
dml_ranger_df$dml_ranger_estimate = dml_ranger$all_coef[1,1] | |
dml_xgb = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.xgboost"), mlr3::lrn("regr.xgboost")) | |
dml_xgb$fit() | |
dml_xgb_df = data.frame(dml_xgb$confint()) | |
colnames(dml_xgb_df) <- c("dml_xgb_lower", "dml_xgb_upper") | |
dml_xgb_df$dml_xgb_estimate = dml_xgb$all_coef[1,1] | |
dml_svm = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.svm"), mlr3::lrn("regr.svm")) | |
dml_svm$fit() | |
dml_svm_df = data.frame(dml_svm$confint()) | |
colnames(dml_svm_df) <- c("dml_svm_lower", "dml_svm_upper") | |
dml_svm_df$dml_svm_estimate = dml_svm$all_coef[1,1] | |
x_var = summary(lm(y ~ ., data = df))$coefficients["x",] | |
linear_df = data.frame((x_var["Estimate"] + c(-1,0, 1) *qnorm(.975) * x_var["Std. Error"]) |> t()) | |
colnames(linear_df) <- c("linear_lower", "linear_estimate", "linear_upper") | |
spline_mod = lm(y ~ x + splines::ns(c0) + splines::ns(c1, df = 3) + splines::ns(c2, df = 3) + splines::ns(c3, df = 3), data = df) | |
x_var = summary(spline_mod)$coefficients["x",] | |
spline_df = data.frame((x_var["Estimate"] + c(-1,0, 1) * qnorm(.975) * x_var["Std. Error"]) |> t()) | |
colnames(spline_df) <- c("spline_lower","spline_estimate", "spline_upper") | |
output_df = cbind(dml_ranger_df, spline_df, linear_df, dml_xgb_df, dml_svm_df) | |
return(c(output_df[1,])) | |
} | |
summary <- data.table() | |
for( i in 1:500 ){ | |
summary <- rbind(summary, get_estimates()) | |
} | |
sum_df <- data.table() | |
tmp <- data.frame(type = rep("ranger", 500), estimate = summary$dml_ranger_estimate) | |
sum_df <- rbind( sum_df, tmp ) | |
tmp <- data.frame(type = rep("svm", 500), estimate = summary$dml_svm_estimate) | |
sum_df <- rbind( sum_df, tmp ) | |
tmp <- data.frame(type = rep("lin", 500), estimate = summary$linear_estimate) | |
sum_df <- rbind( sum_df, tmp ) | |
tmp <- data.frame(type = rep("spline", 500), estimate = summary$spline_estimate) | |
sum_df <- rbind( sum_df, tmp ) | |
tmp <- data.frame(type = rep("xgb", 500), estimate = summary$dml_xgb_estimate) | |
sum_df <- rbind( sum_df, tmp ) | |
library(plyr) | |
mu <- ddply(sum_df, "type", summarise, grp.mean=mean(estimate)) | |
head(mu) | |
p<-ggplot(sum_df, aes(x=estimate, color=type, fill=type)) + | |
geom_density(alpha=0.6)+ | |
geom_vline(data=mu, aes(xintercept=grp.mean, color=type), | |
linetype="dashed") | |
# Continuous colors | |
p + scale_color_brewer(palette="Accent") + | |
theme_minimal()+theme(legend.position="top") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(DoubleML) | |
library(data.table) | |
V1 <- c(1:50) | |
a <- as.data.frame(V1) | |
a[1:50] <- sapply(1:50, "+", a[[1]]) | |
for(i in 1:50){ | |
a[i] <- rnorm(50, 10, 2) | |
} | |
A <- data.table(a) | |
A[,"D" := rbinom(50, 1, 0.4)] | |
# make a little endg for confounding | |
for(i in 1:50){ | |
if(A$V1[i] > 13) { | |
A[i]$D = 1 | |
} | |
if(A$V1[i] < 9) { | |
A[i]$D = 0 | |
} | |
} | |
A[,"Y" := rep(0, 50)] | |
T <- rnorm(50, 20, 1) | |
C <- rnorm(50, 10, 1) | |
for(i in 1:50){ | |
A[i]$Y = ifelse(A[i]$D == 1, T[i], C[i]) + (rowSums(A[i])/100) | |
# making some variables non nuisance | |
A[i]$Y = A[i]$Y + ((A[i]$V2 + A[i]$V3 + A[i]$V4 + A[i]$V5 + A[i]$V6) / 10) | |
A[i]$V10 = A[i]$V10 + ifelse(A[i]$Y > 25, runif(1, 0, 2), runif(1, 10, 20)) | |
} | |
APE_uncond = A[D==1, mean(Y)] - A[D==0, mean(Y)] | |
round(APE_uncond, 2) | |
# Specify the data and variables for the causal model | |
dml_data_bonus = DoubleMLData$new(A, #dataset | |
y_col = "Y", # outcome | |
d_cols = "D", # treat inflated for treat v control | |
x_cols = names(A)[1:50]) # 2-6 not nuisance, 6-50 nuisance | |
print(dml_data_bonus) | |
library(mlr3) | |
library(mlr3learners) | |
# surpress messages from mlr3 package during fitting | |
lgr::get_logger("mlr3")$set_threshold("warn") | |
learner = lrn("regr.ranger", num.trees=500, max.depth=5, min.node.size=2) | |
ml_l_bonus = learner$clone() | |
ml_m_bonus = learner$clone() | |
set.seed(3141) | |
obj_dml_plr_bonus = DoubleMLPLR$new(dml_data_bonus, ml_l=ml_l_bonus, ml_m=ml_m_bonus) | |
obj_dml_plr_bonus$fit() | |
print(obj_dml_plr_bonus) | |
# wow terrible lol | |
lr <- lm("Y ~ as.factor(D)+V1+V2+V3+V4+V5+V6+V7+V8+V9+V10+V11+V12+V13+V14+V15+V16+V17+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V28+V29+V30+V31+V32+V33+V34+V35+V36+V37+V38+V39+V40+V41+V42+V43+V44+V45+V46+V47+V48+V49+V50", A) | |
stargazer::stargazer(lr, type = "text") | |
#install.packages("FactoMineR") | |
library("FactoMineR") | |
#install.packages("corrr") | |
library('corrr') | |
#install.packages("ggcorrplot") | |
library(ggcorrplot) | |
#install.packages("factoextra") | |
library(factoextra) | |
corr_matrix <- cor(A) | |
data.pca <- princomp(corr_matrix) | |
fviz_eig(data.pca, addlabels = TRUE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment