joshuajnoble · April 24, 2023 18:39
diff --git a/partially_linear.r b/partially_linear.r
 transform_data <- function(c, i){
  randnum = runif(1)
  if (round(i/3) == i/3){
    return(cos(c + 1))
  }
  else if (round(i/4) == i/4){
    return(sin(c + 1))
  }
  else if (round(i/2) == i/2){
    return(c^2)
  }
  else {
    return(exp(c)    )        
  }
 }

 generate_data <- function (n, n_confounders = 6, effect_size = 2){
  
  c0 = rnorm(n)
  df = data.frame(c0 = c0) #at least one confounder with a linear effect
  m1 = c0 + rnorm(n)
  m2 = c0 + rnorm(n)
  
  for (i in 1:(n_confounders - 1)){
    c = rnorm(n) 
    m1 = m1 + transform_data(c, i)
    m2 = m2 + transform_data(c, i + 1)
    
    df[,paste0("c", as.character(i))] = c
  }
  
  x = rnorm(n, sd = sd(m1) / 2) + m1 #we want x to be mostly comprised of effects from the confounders
  x = (x - mean(x)) / sd(x) #rescale to be the same as other covariates
  
  y = x * effect_size + m2 + rnorm(n, sd = sd(m2) /10)
  df$x = x
  df$y = y
  return(df)
 }

 get_estimates <- function(n = 500){
  df = generate_data(n)
  
  df_doubleml = DoubleML::double_ml_data_from_data_frame(df, y_col = c("y"), d_cols = c("x"))
  dml_ranger = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.ranger"), mlr3::lrn("regr.ranger"))
  dml_ranger$fit()
  dml_ranger_df = data.frame(dml_ranger$confint())
  colnames(dml_ranger_df) <- c("dml_ranger_lower", "dml_ranger_upper")
  dml_ranger_df$dml_ranger_estimate = dml_ranger$all_coef[1,1]
  
  dml_xgb = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.xgboost"), mlr3::lrn("regr.xgboost"))
  dml_xgb$fit()
  dml_xgb_df = data.frame(dml_xgb$confint())
  colnames(dml_xgb_df) <- c("dml_xgb_lower", "dml_xgb_upper")
  dml_xgb_df$dml_xgb_estimate = dml_xgb$all_coef[1,1]
  
  dml_svm = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.svm"), mlr3::lrn("regr.svm"))
  dml_svm$fit()
  dml_svm_df = data.frame(dml_svm$confint())
  colnames(dml_svm_df) <- c("dml_svm_lower", "dml_svm_upper")
  dml_svm_df$dml_svm_estimate = dml_svm$all_coef[1,1]
  
  x_var = summary(lm(y ~ ., data = df))$coefficients["x",] 
  
  linear_df = data.frame((x_var["Estimate"] + c(-1,0, 1) *qnorm(.975) * x_var["Std. Error"]) |> t())
  colnames(linear_df) <- c("linear_lower", "linear_estimate", "linear_upper")
  
  spline_mod = lm(y ~ x +  splines::ns(c0) + splines::ns(c1, df = 3) + splines::ns(c2, df = 3) + splines::ns(c3, df = 3), data = df)
  
  x_var = summary(spline_mod)$coefficients["x",] 
  spline_df = data.frame((x_var["Estimate"] + c(-1,0, 1) * qnorm(.975) * x_var["Std. Error"]) |> t())
  colnames(spline_df) <- c("spline_lower","spline_estimate", "spline_upper")
  
  output_df = cbind(dml_ranger_df, spline_df, linear_df, dml_xgb_df, dml_svm_df)
  return(c(output_df[1,]))
  
 }

 summary <- data.table()
 for( i in 1:500 ){
  summary <- rbind(summary, get_estimates())
 }

 sum_df <- data.table()

 tmp <- data.frame(type = rep("ranger", 500), estimate = summary$dml_ranger_estimate)
 sum_df <- rbind( sum_df, tmp )

 tmp <- data.frame(type = rep("svm", 500), estimate = summary$dml_svm_estimate)
 sum_df <- rbind( sum_df, tmp )

 tmp <- data.frame(type = rep("lin", 500), estimate = summary$linear_estimate)
 sum_df <- rbind( sum_df, tmp )

 tmp <- data.frame(type = rep("spline", 500), estimate = summary$spline_estimate)
 sum_df <- rbind( sum_df, tmp )

 tmp <- data.frame(type = rep("xgb", 500), estimate = summary$dml_xgb_estimate)
 sum_df <- rbind( sum_df, tmp )

 library(plyr)
 mu <- ddply(sum_df, "type", summarise, grp.mean=mean(estimate))
 head(mu)

 p<-ggplot(sum_df, aes(x=estimate, color=type, fill=type)) +
  geom_density(alpha=0.6)+
  geom_vline(data=mu, aes(xintercept=grp.mean, color=type),
             linetype="dashed")
 # Continuous colors
 p + scale_color_brewer(palette="Accent") + 
  theme_minimal()+theme(legend.position="top")

diff --git a/synth_data_simple.r b/synth_data_simple.r

 library(DoubleML)
 library(data.table)

 V1 <- c(1:50)

 a <- as.data.frame(V1)
 a[1:50] <- sapply(1:50, "+", a[[1]])

 for(i in 1:50){
  a[i] <- rnorm(50, 10, 2)
 }

 A <- data.table(a)

 A[,"D" := rbinom(50, 1, 0.4)]

 # make a little endg for confounding
 for(i in 1:50){
  if(A$V1[i] > 13) {
    A[i]$D = 1
  }
  if(A$V1[i] < 9) {
    A[i]$D = 0
  }
 }

 A[,"Y" := rep(0, 50)]

 T <- rnorm(50, 20, 1)
 C <- rnorm(50, 10, 1)

 for(i in 1:50){
  A[i]$Y = ifelse(A[i]$D == 1, T[i], C[i]) + (rowSums(A[i])/100)
  
  # making some variables non nuisance
  A[i]$Y = A[i]$Y + ((A[i]$V2 + A[i]$V3 + A[i]$V4 + A[i]$V5 + A[i]$V6) / 10)
  
  A[i]$V10 = A[i]$V10 + ifelse(A[i]$Y > 25, runif(1, 0, 2), runif(1, 10, 20))
 }

 APE_uncond = A[D==1, mean(Y)] - A[D==0, mean(Y)]
 round(APE_uncond, 2)

 # Specify the data and variables for the causal model
 dml_data_bonus = DoubleMLData$new(A, #dataset
                                  y_col = "Y", # outcome
                                  d_cols = "D", # treat inflated for treat v control
                                  x_cols = names(A)[1:50]) # 2-6 not nuisance, 6-50 nuisance


 print(dml_data_bonus)

 library(mlr3)
 library(mlr3learners)
 # surpress messages from mlr3 package during fitting
 lgr::get_logger("mlr3")$set_threshold("warn")

 learner = lrn("regr.ranger", num.trees=500, max.depth=5, min.node.size=2)
 ml_l_bonus = learner$clone()
 ml_m_bonus = learner$clone()

 set.seed(3141)
 obj_dml_plr_bonus = DoubleMLPLR$new(dml_data_bonus, ml_l=ml_l_bonus, ml_m=ml_m_bonus)
 obj_dml_plr_bonus$fit()
 print(obj_dml_plr_bonus)

 # wow terrible lol
 lr <- lm("Y ~ as.factor(D)+V1+V2+V3+V4+V5+V6+V7+V8+V9+V10+V11+V12+V13+V14+V15+V16+V17+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V28+V29+V30+V31+V32+V33+V34+V35+V36+V37+V38+V39+V40+V41+V42+V43+V44+V45+V46+V47+V48+V49+V50", A)
 stargazer::stargazer(lr, type = "text")

 #install.packages("FactoMineR")
 library("FactoMineR")
 #install.packages("corrr")
 library('corrr')
 #install.packages("ggcorrplot")
 library(ggcorrplot)

 #install.packages("factoextra")
 library(factoextra)

 corr_matrix <- cor(A)
 data.pca <- princomp(corr_matrix)

 fviz_eig(data.pca, addlabels = TRUE)
	transform_data <- function(c, i){
	randnum = runif(1)
	if (round(i/3) == i/3){
	return(cos(c + 1))
	}
	else if (round(i/4) == i/4){
	return(sin(c + 1))
	}
	else if (round(i/2) == i/2){
	return(c^2)
	}
	else {
	return(exp(c) )
	}
	}

	generate_data <- function (n, n_confounders = 6, effect_size = 2){

	c0 = rnorm(n)
	df = data.frame(c0 = c0) #at least one confounder with a linear effect
	m1 = c0 + rnorm(n)
	m2 = c0 + rnorm(n)

	for (i in 1:(n_confounders - 1)){
	c = rnorm(n)
	m1 = m1 + transform_data(c, i)
	m2 = m2 + transform_data(c, i + 1)

	df[,paste0("c", as.character(i))] = c
	}

	x = rnorm(n, sd = sd(m1) / 2) + m1 #we want x to be mostly comprised of effects from the confounders
	x = (x - mean(x)) / sd(x) #rescale to be the same as other covariates

	y = x * effect_size + m2 + rnorm(n, sd = sd(m2) /10)
	df$x = x
	df$y = y
	return(df)
	}

	get_estimates <- function(n = 500){
	df = generate_data(n)

	df_doubleml = DoubleML::double_ml_data_from_data_frame(df, y_col = c("y"), d_cols = c("x"))
	dml_ranger = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.ranger"), mlr3::lrn("regr.ranger"))
	dml_ranger$fit()
	dml_ranger_df = data.frame(dml_ranger$confint())
	colnames(dml_ranger_df) <- c("dml_ranger_lower", "dml_ranger_upper")
	dml_ranger_df$dml_ranger_estimate = dml_ranger$all_coef[1,1]

	dml_xgb = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.xgboost"), mlr3::lrn("regr.xgboost"))
	dml_xgb$fit()
	dml_xgb_df = data.frame(dml_xgb$confint())
	colnames(dml_xgb_df) <- c("dml_xgb_lower", "dml_xgb_upper")
	dml_xgb_df$dml_xgb_estimate = dml_xgb$all_coef[1,1]

	dml_svm = DoubleML::DoubleMLPLR$new(df_doubleml, mlr3::lrn("regr.svm"), mlr3::lrn("regr.svm"))
	dml_svm$fit()
	dml_svm_df = data.frame(dml_svm$confint())
	colnames(dml_svm_df) <- c("dml_svm_lower", "dml_svm_upper")
	dml_svm_df$dml_svm_estimate = dml_svm$all_coef[1,1]

	x_var = summary(lm(y ~ ., data = df))$coefficients["x",]

	linear_df = data.frame((x_var["Estimate"] + c(-1,0, 1) qnorm(.975) x_var["Std. Error"]) \|> t())
	colnames(linear_df) <- c("linear_lower", "linear_estimate", "linear_upper")

	spline_mod = lm(y ~ x + splines::ns(c0) + splines::ns(c1, df = 3) + splines::ns(c2, df = 3) + splines::ns(c3, df = 3), data = df)

	x_var = summary(spline_mod)$coefficients["x",]
	spline_df = data.frame((x_var["Estimate"] + c(-1,0, 1) * qnorm(.975) * x_var["Std. Error"]) \|> t())
	colnames(spline_df) <- c("spline_lower","spline_estimate", "spline_upper")

	output_df = cbind(dml_ranger_df, spline_df, linear_df, dml_xgb_df, dml_svm_df)
	return(c(output_df[1,]))

	}

	summary <- data.table()
	for( i in 1:500 ){
	summary <- rbind(summary, get_estimates())
	}

	sum_df <- data.table()

	tmp <- data.frame(type = rep("ranger", 500), estimate = summary$dml_ranger_estimate)
	sum_df <- rbind( sum_df, tmp )

	tmp <- data.frame(type = rep("svm", 500), estimate = summary$dml_svm_estimate)
	sum_df <- rbind( sum_df, tmp )

	tmp <- data.frame(type = rep("lin", 500), estimate = summary$linear_estimate)
	sum_df <- rbind( sum_df, tmp )

	tmp <- data.frame(type = rep("spline", 500), estimate = summary$spline_estimate)
	sum_df <- rbind( sum_df, tmp )

	tmp <- data.frame(type = rep("xgb", 500), estimate = summary$dml_xgb_estimate)
	sum_df <- rbind( sum_df, tmp )

	library(plyr)
	mu <- ddply(sum_df, "type", summarise, grp.mean=mean(estimate))
	head(mu)

	p<-ggplot(sum_df, aes(x=estimate, color=type, fill=type)) +
	geom_density(alpha=0.6)+
	geom_vline(data=mu, aes(xintercept=grp.mean, color=type),
	linetype="dashed")
	# Continuous colors
	p + scale_color_brewer(palette="Accent") +
	theme_minimal()+theme(legend.position="top")

	library(DoubleML)
	library(data.table)

	V1 <- c(1:50)

	a <- as.data.frame(V1)
	a[1:50] <- sapply(1:50, "+", a[[1]])

	for(i in 1:50){
	a[i] <- rnorm(50, 10, 2)
	}

	A <- data.table(a)

	A[,"D" := rbinom(50, 1, 0.4)]

	# make a little endg for confounding
	for(i in 1:50){
	if(A$V1[i] > 13) {
	A[i]$D = 1
	}
	if(A$V1[i] < 9) {
	A[i]$D = 0
	}
	}

	A[,"Y" := rep(0, 50)]

	T <- rnorm(50, 20, 1)
	C <- rnorm(50, 10, 1)

	for(i in 1:50){
	A[i]$Y = ifelse(A[i]$D == 1, T[i], C[i]) + (rowSums(A[i])/100)

	# making some variables non nuisance
	A[i]$Y = A[i]$Y + ((A[i]$V2 + A[i]$V3 + A[i]$V4 + A[i]$V5 + A[i]$V6) / 10)

	A[i]$V10 = A[i]$V10 + ifelse(A[i]$Y > 25, runif(1, 0, 2), runif(1, 10, 20))
	}

	APE_uncond = A[D==1, mean(Y)] - A[D==0, mean(Y)]
	round(APE_uncond, 2)

	# Specify the data and variables for the causal model
	dml_data_bonus = DoubleMLData$new(A, #dataset
	y_col = "Y", # outcome
	d_cols = "D", # treat inflated for treat v control
	x_cols = names(A)[1:50]) # 2-6 not nuisance, 6-50 nuisance


	print(dml_data_bonus)

	library(mlr3)
	library(mlr3learners)
	# surpress messages from mlr3 package during fitting
	lgr::get_logger("mlr3")$set_threshold("warn")

	learner = lrn("regr.ranger", num.trees=500, max.depth=5, min.node.size=2)
	ml_l_bonus = learner$clone()
	ml_m_bonus = learner$clone()

	set.seed(3141)
	obj_dml_plr_bonus = DoubleMLPLR$new(dml_data_bonus, ml_l=ml_l_bonus, ml_m=ml_m_bonus)
	obj_dml_plr_bonus$fit()
	print(obj_dml_plr_bonus)

	# wow terrible lol
	lr <- lm("Y ~ as.factor(D)+V1+V2+V3+V4+V5+V6+V7+V8+V9+V10+V11+V12+V13+V14+V15+V16+V17+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V28+V29+V30+V31+V32+V33+V34+V35+V36+V37+V38+V39+V40+V41+V42+V43+V44+V45+V46+V47+V48+V49+V50", A)
	stargazer::stargazer(lr, type = "text")

	#install.packages("FactoMineR")
	library("FactoMineR")
	#install.packages("corrr")
	library('corrr')
	#install.packages("ggcorrplot")
	library(ggcorrplot)

	#install.packages("factoextra")
	library(factoextra)

	corr_matrix <- cor(A)
	data.pca <- princomp(corr_matrix)

	fviz_eig(data.pca, addlabels = TRUE)