Causality Reading: Double ML
# Libraries used throughout: DoubleML + mlr3/mlr3learners for the double ML
# fits, data.table and ggplot2 for collecting and plotting the results
library(DoubleML)
library(mlr3)
library(mlr3learners)
library(data.table)
library(ggplot2)

# Apply a nonlinear transform to a confounder, chosen by the loop index i
transform_data <- function(c, i) {
  if (round(i/3) == i/3) {
    return(cos(c + 1))
  } else if (round(i/4) == i/4) {
    return(sin(c + 1))
  } else if (round(i/2) == i/2) {
    return(c^2)
  } else {
    return(exp(c))
  }
}
generate_data <- function(n, n_confounders = 6, effect_size = 2) {
  c0 <- rnorm(n)
  df <- data.frame(c0 = c0) # at least one confounder with a linear effect
  m1 <- c0 + rnorm(n)
  m2 <- c0 + rnorm(n)
  for (i in 1:(n_confounders - 1)) {
    c <- rnorm(n)
    m1 <- m1 + transform_data(c, i)
    m2 <- m2 + transform_data(c, i + 1)
    df[, paste0("c", as.character(i))] <- c
  }
  x <- rnorm(n, sd = sd(m1) / 2) + m1 # we want x to be mostly comprised of effects from the confounders
  x <- (x - mean(x)) / sd(x) # rescale to match the other covariates
  y <- x * effect_size + m2 + rnorm(n, sd = sd(m2) / 10)
  df$x <- x
  df$y <- y
  return(df)
}
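# Quick sanity check (a sketch, not part of the original gist): the true
# effect of x on y is effect_size = 2 by construction. A linear adjustment
# for the confounders helps but cannot fully remove the bias, since they
# enter m1 and m2 nonlinearly.
df_check <- generate_data(10000)
coef(lm(y ~ x, data = df_check))["x"]  # unadjusted: confounded
coef(lm(y ~ ., data = df_check))["x"]  # linearly adjusted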
get_estimates <- function(n = 500) {
  df <- generate_data(n)
  df_doubleml <- DoubleML::double_ml_data_from_data_frame(df, y_col = "y", d_cols = "x")

  # partially linear model with random-forest nuisance learners
  dml_ranger <- DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.ranger"), mlr3::lrn("regr.ranger"))
  dml_ranger$fit()
  dml_ranger_df <- data.frame(dml_ranger$confint())
  colnames(dml_ranger_df) <- c("dml_ranger_lower", "dml_ranger_upper")
  dml_ranger_df$dml_ranger_estimate <- dml_ranger$all_coef[1, 1]

  # same model with gradient-boosted trees
  dml_xgb <- DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.xgboost"), mlr3::lrn("regr.xgboost"))
  dml_xgb$fit()
  dml_xgb_df <- data.frame(dml_xgb$confint())
  colnames(dml_xgb_df) <- c("dml_xgb_lower", "dml_xgb_upper")
  dml_xgb_df$dml_xgb_estimate <- dml_xgb$all_coef[1, 1]

  # same model with support vector regression
  dml_svm <- DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.svm"), mlr3::lrn("regr.svm"))
  dml_svm$fit()
  dml_svm_df <- data.frame(dml_svm$confint())
  colnames(dml_svm_df) <- c("dml_svm_lower", "dml_svm_upper")
  dml_svm_df$dml_svm_estimate <- dml_svm$all_coef[1, 1]

  # baseline: plain linear regression adjusting for all confounders linearly
  x_var <- summary(lm(y ~ ., data = df))$coefficients["x", ]
  linear_df <- data.frame((x_var["Estimate"] + c(-1, 0, 1) * qnorm(.975) * x_var["Std. Error"]) |> t())
  colnames(linear_df) <- c("linear_lower", "linear_estimate", "linear_upper")

  # natural-spline adjustment (note: only c0 through c3)
  spline_mod <- lm(y ~ x + splines::ns(c0) + splines::ns(c1, df = 3) + splines::ns(c2, df = 3) + splines::ns(c3, df = 3), data = df)
  x_var <- summary(spline_mod)$coefficients["x", ]
  spline_df <- data.frame((x_var["Estimate"] + c(-1, 0, 1) * qnorm(.975) * x_var["Std. Error"]) |> t())
  colnames(spline_df) <- c("spline_lower", "spline_estimate", "spline_upper")

  output_df <- cbind(dml_ranger_df, spline_df, linear_df, dml_xgb_df, dml_svm_df)
  return(c(output_df[1, ]))
}
# run the simulation 500 times and collect the estimates
# (named `results` rather than `summary` to avoid shadowing base::summary)
results <- data.table()
for (i in 1:500) {
  results <- rbind(results, get_estimates())
}
sum_df <- data.table()
tmp <- data.frame(type = rep("ranger", 500), estimate = results$dml_ranger_estimate)
sum_df <- rbind(sum_df, tmp)
tmp <- data.frame(type = rep("svm", 500), estimate = results$dml_svm_estimate)
sum_df <- rbind(sum_df, tmp)
tmp <- data.frame(type = rep("lin", 500), estimate = results$linear_estimate)
sum_df <- rbind(sum_df, tmp)
tmp <- data.frame(type = rep("spline", 500), estimate = results$spline_estimate)
sum_df <- rbind(sum_df, tmp)
tmp <- data.frame(type = rep("xgb", 500), estimate = results$dml_xgb_estimate)
sum_df <- rbind(sum_df, tmp)
library(plyr)
mu <- ddply(sum_df, "type", summarise, grp.mean = mean(estimate))
head(mu)
p <- ggplot(sum_df, aes(x = estimate, color = type, fill = type)) +
  geom_density(alpha = 0.6) +
  geom_vline(data = mu, aes(xintercept = grp.mean, color = type),
             linetype = "dashed")
# qualitative Brewer palette, one color per estimator
p + scale_color_brewer(palette = "Accent") +
  theme_minimal() + theme(legend.position = "top")
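# A small follow-up sketch (not in the original gist): tabulate bias and
# spread per estimator against the true effect_size of 2 that
# generate_data() bakes in, to complement the density plot.
sum_df[, .(bias = mean(estimate) - 2, sd = sd(estimate)), by = type]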
# Second script in the gist: a binary treatment with many nuisance covariates
library(DoubleML)
library(data.table)
# 50 observations of 50 covariates V1..V50, each ~ N(10, 2); the original
# sapply-then-overwrite construction reduces to a single matrix draw
a <- as.data.frame(matrix(rnorm(50 * 50, 10, 2), nrow = 50))
A <- data.table(a)
A[, D := rbinom(50, 1, 0.4)]
# add a little endogeneity for confounding: extreme V1 values force treatment
A[V1 > 13, D := 1]
A[V1 < 9, D := 0]
A[,"Y" := rep(0, 50)]
T <- rnorm(50, 20, 1)
C <- rnorm(50, 10, 1)
for(i in 1:50){
A[i]$Y = ifelse(A[i]$D == 1, T[i], C[i]) + (rowSums(A[i])/100)
# making some variables non nuisance
A[i]$Y = A[i]$Y + ((A[i]$V2 + A[i]$V3 + A[i]$V4 + A[i]$V5 + A[i]$V6) / 10)
A[i]$V10 = A[i]$V10 + ifelse(A[i]$Y > 25, runif(1, 0, 2), runif(1, 10, 20))
}
# naive unconditional difference in means between treated and control
APE_uncond <- A[D == 1, mean(Y)] - A[D == 0, mean(Y)]
round(APE_uncond, 2)
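# Reference point (a sketch, not in the original gist): by construction,
# flipping D from 0 to 1 shifts Y by (treat_y - ctrl_y) plus the small
# D/100 term from rowSums(A)/100, so the built-in effect is roughly 10;
# the gap to APE_uncond is the V1-driven confounding.
mean(treat_y) - mean(ctrl_y) + 1/100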
# Specify the data and variables for the causal model
dml_data_bonus <- DoubleMLData$new(A,                       # dataset
                                   y_col = "Y",             # outcome
                                   d_cols = "D",            # treatment
                                   x_cols = names(A)[1:50]) # V2-V6 non-nuisance, the rest nuisance
print(dml_data_bonus)
library(mlr3)
library(mlr3learners)
# suppress messages from the mlr3 package during fitting
lgr::get_logger("mlr3")$set_threshold("warn")
learner <- lrn("regr.ranger", num.trees = 500, max.depth = 5, min.node.size = 2)
ml_l_bonus <- learner$clone()
ml_m_bonus <- learner$clone()
set.seed(3141)
obj_dml_plr_bonus <- DoubleMLPLR$new(dml_data_bonus, ml_l = ml_l_bonus, ml_m = ml_m_bonus)
obj_dml_plr_bonus$fit()
print(obj_dml_plr_bonus)
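# Pull the point estimate and 95% CI explicitly, using the same accessors
# the first script relies on, rather than eyeballing the print output.
obj_dml_plr_bonus$all_coef[1, 1]
obj_dml_plr_bonus$confint()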
# wow terrible lol
lr <- lm(Y ~ as.factor(D) + . - D, data = A) # same fit as spelling out V1..V50
stargazer::stargazer(lr, type = "text")
# install.packages("FactoMineR")
library(FactoMineR)
# install.packages("corrr")
library(corrr)
# install.packages("ggcorrplot")
library(ggcorrplot)
# install.packages("factoextra")
library(factoextra)
corr_matrix <- cor(A)
# PCA on the data with scaling (equivalent to working from the correlation
# matrix); prcomp rather than princomp here, since princomp(corr_matrix)
# would treat the rows of the correlation matrix themselves as observations
data.pca <- prcomp(A, scale. = TRUE)
fviz_eig(data.pca, addlabels = TRUE)
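# ggcorrplot is attached above but never called; a one-line sketch putting
# it to use on the same correlation matrix.
ggcorrplot(corr_matrix)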