# CS112 - Assignment 2 Solutions (Spring 2019)
# Forked from tomkreker/CS112-Assignment 2 Solutions (Spring 2019).R
##############
# QUESTION 1:
# a) Replicating figure 9.7 (right panel) from Gelman et al.
# Separate treatment and control data
foo = read.csv("https://tinyurl.com/wlgl63b")
ctrl_units = foo[which(foo$treatment==0),]
trt_units = foo[which(foo$treatment==1),]
# Fit regression models for both groups
lm_ctrl = lm(post.test ~ pre.test, data=ctrl_units)
lm_trt = lm(post.test ~ pre.test, data=trt_units)
# Plot the data and the regression lines
plot(x=c(0:125), y=c(0:125), type='n', xlab='Pre-test', ylab='Post-test') # following the original figure's layout (all points in the corner)
#plot(x=c(70:125), y=c(70:125), type='n', xlab='Pre-test', ylab='Post-test') # a clearer version
points(ctrl_units$pre.test, ctrl_units$post.test, pch=16)
points(trt_units$pre.test, trt_units$post.test, pch=1)
legend(x=0, y=125, legend=c('Treatment','Control'), lty=1:2)
title('Grade 4 post-test vs. pre-test scores')
abline(lm_trt)
abline(lm_ctrl, lty=2)
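# Optional cross-check (not part of the original figure): the same two lines can be
# recovered from a single model with a treatment interaction. The point estimates
# should match the separate fits above exactly; 'lm_both' is just an illustrative name.
lm_both <- lm(post.test ~ pre.test * treatment, data = foo)
coef(lm_ctrl)  # control line: intercept and slope
c(coef(lm_both)["(Intercept)"], coef(lm_both)["pre.test"])
coef(lm_trt)   # treatment line: intercept and slope
c(coef(lm_both)["(Intercept)"] + coef(lm_both)["treatment"],
  coef(lm_both)["pre.test"] + coef(lm_both)["pre.test:treatment"])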
# b) Changing the value of one datum to change the treatment effect
trt_units[11,'post.test'] = 70
lm_trt_b = lm(post.test ~ pre.test, data=trt_units)
# Plot the data and the regression lines
#plot(x=c(0:125), y=c(0:125), type='n', xlab='Pre-test', ylab='Post-test') # following the original figure's layout (all points in the corner)
plot(x=c(70:125), y=c(70:125), type='n', xlab='Pre-test', ylab='Post-test') # a clearer version
points(ctrl_units$pre.test, ctrl_units$post.test, pch=16)
points(trt_units$pre.test, trt_units$post.test, pch=1)
points(104.1, 70, col='red', pch=0, cex=2) # highlight the modified point
title('Grade 4 post-test vs. pre-test scores - one observation changed')
legend(x=70, y=125, legend=c('Treatment','Control'), lty=1:2)
abline(lm_trt_b)
abline(lm_ctrl, lty=2)
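# Quick check (not required by the question): compare the treatment-group slope before
# and after changing the single observation, to see how much leverage that point has.
coef(lm_trt)["pre.test"]    # original treatment-group slope
coef(lm_trt_b)["pre.test"]  # slope after setting one post-test score to 70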
# c) Replicate figure 9.8
library(arm)
lm_c = lm(post.test ~ treatment + pre.test + treatment:pre.test, data=foo)
lm_c_sim = sim(lm_c, n.sims=20)
plot(0, 0, xlim=range(foo$pre.test), ylim=c(-5,15),
     xlab="Pre-test", ylab="Treatment effect",
     main="Treatment effect for Grade 4")
abline(0, 0, lwd=.5, lty=2)
for (i in 1:20){
  # the treatment effect at a given pre-test score is the treatment coefficient
  # plus the interaction coefficient times the score (curve() supplies x itself)
  curve(lm_c_sim@coef[i,2] + lm_c_sim@coef[i,4]*x, lwd=.5, col="gray",
        add=TRUE)}
curve(coef(lm_c)[2] + coef(lm_c)[4]*x, lwd=.5, add=TRUE)
legend(x=105, y=15, legend=c('Main regression','Simulated results'), lty=1, col=c('black','grey'))
# 95% confidence interval for the treatment effect
pre_test = c(78:120)
num_sims = 100
storage_matrix = matrix(NA, nrow=length(pre_test), ncol=num_sims)
lm_c_sim = sim(lm_c, n.sims=num_sims)
for (score in pre_test){
  for (i in 1:num_sims){
    storage_matrix[score-77, i] = lm_c_sim@coef[i,2] + lm_c_sim@coef[i,4]*score
  }
}
conf.int <- apply(storage_matrix, 1, quantile, probs = c(0.025, 0.975))
lines(pre_test, conf.int[1,], lwd=0.5, col='red')
lines(pre_test, conf.int[2,], lwd=0.5, col='red')
legend(x=105, y=15, legend=c('Main regression','Simulated results', '95% CI'), lty=1, col=c('black','grey', 'red'))
# The uncertainty around the treatment effect is lowest for pre-test values of roughly
# 100-110. This is where most observations lie, so most simulated regression lines
# are pulled toward the data in this region, reducing the variation.
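# Equivalent, loop-free construction of the same simulated treatment effects
# (a sketch only; 'te_draws' and 'conf.int.vec' are illustrative names). Each
# column of te_draws is one simulation, each row one pre-test score.
te_draws <- cbind(1, pre_test) %*% t(lm_c_sim@coef[, c(2, 4)])
conf.int.vec <- apply(te_draws, 1, quantile, probs = c(0.025, 0.975))
all.equal(unname(conf.int), unname(conf.int.vec))  # should be TRUE (up to numerical tolerance)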
##############
# QUESTION 2:
# Using the "tinting" data set in the "DAAG" package, fit a linear model that predicts csoa as a function of age, sex, target,
# I(tint != "no"), and I((tint != "no")*age). Estimate:
##############
library(arm)
tinting = read.csv(url("https://tinyurl.com/v4bq99k"))
# a) the 95% interval of expected values for csoa, for typical treated female units, with target == "hicon", ages 20, 30, 40,
# 50, 60, 70, and 80, using simulation (i.e., 1000 simulated predictions for every row from 1000 sets of coefficients).
# In this data set, a "treated unit" is a unit for which tint != "no" is TRUE.
# Fit the regression model
lm_tint <-
  lm(csoa ~ age + sex + target + I(as.numeric(tint != "no")) + I(as.numeric(tint != "no")*age), data = tinting)
summary(lm_tint)
# Simulate coefficient draws (stored under a name that does not mask arm::sim)
iterations <- 1000
sim_tint <- sim(lm_tint, n.sims = iterations)
# Predict expected csoa for every age of interest
ages = c(20, 30, 40, 50, 60, 70, 80)
simulated.trt <- matrix(NA, nrow = iterations, ncol = length(ages))
for (age in ages) {
  # Covariate vector: intercept, age, sex dummy (0 = female), target dummy,
  # treatment dummy (1 = tint != "no"), treatment-by-age interaction.
  # Check names(coef(lm_tint)) to confirm which target level the 4th position encodes.
  Xs <- c(1, age, 0, 1, 1, 1*age)
  for (i in 1:iterations) {
    simulated.trt[i, 1 + (age-min(ages))/10] <- sum(Xs*sim_tint@coef[i,])
  }
}
# Table with the relevant point estimates (bounds of the 95% intervals of expected csoa for each age)
conf.intervals.trt <- apply(simulated.trt, 2, quantile, probs = c(0.025, 0.975))
mean.trt <- apply(simulated.trt, 2, mean)
table.mean.trt <- t(data.frame(conf.intervals.trt))
colnames(table.mean.trt) <- c("Mean CI Lower Bound", "Mean CI Upper Bound")
rownames(table.mean.trt) <- ages
View(table.mean.trt)
# Plot the predicted csoa (y-axis) against age (x-axis)
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min(ages), max(ages)),
     ylim = c(0, 100),
     main = "csoa by age for typical treated female units", xlab = "age",
     ylab = "csoa")
for (age in ages) {
  segments(
    x0 = age,
    y0 = conf.intervals.trt[1, 1 + (age-min(ages))/10],
    x1 = age,
    y1 = conf.intervals.trt[2, 1 + (age-min(ages))/10],
    lwd = 1)
}
points(ages, mean.trt, col = "red", pch=16)
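# Optional cross-check of the simulation intervals against the analytical confidence
# intervals from predict(). This assumes tint has some level other than "no" (in the
# DAAG tinting data it is coded no/lo/hi); any non-"no" value marks a treated unit
# because only I(tint != "no") enters the model. 'newdata.trt' is an illustrative name.
# These intervals should be close to the simulation intervals above, provided the Xs
# vector encodes the same covariate profile.
newdata.trt <- data.frame(age = ages, sex = "f", target = "hicon", tint = "lo")
predict(lm_tint, newdata = newdata.trt, interval = "confidence", level = 0.95)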
# b) the 95% prediction interval for the treatment effect, for typical female units. Please follow the instructions in (a),
# using simulation.
# Predict expected csoa for every age of interest, now for untreated (control) units
ages = c(20, 30, 40, 50, 60, 70, 80)
simulated.ctrl <- matrix(NA, nrow = iterations, ncol = length(ages))
for (age in ages) {
  # Same covariate vector as in (a), but with the treatment dummy and the
  # treatment-by-age interaction both set to 0 (control unit).
  Xs <- c(1, age, 0, 1, 0, 0*age)
  for (i in 1:iterations) {
    simulated.ctrl[i, 1 + (age-min(ages))/10] <- sum(Xs*sim_tint@coef[i,])
  }
}
simulated.trt.effect = simulated.trt - simulated.ctrl
# Table with the relevant point estimates (bounds of the 95% intervals of the treatment effect for each age)
conf.intervals.trt.effect <- apply(simulated.trt.effect, 2, quantile, probs = c(0.025, 0.975))
mean.trt.effect <- apply(simulated.trt.effect, 2, mean)
table.mean.trt.effect <- t(data.frame(conf.intervals.trt.effect))
colnames(table.mean.trt.effect) <- c("Mean CI Lower Bound", "Mean CI Upper Bound")
rownames(table.mean.trt.effect) <- ages
View(table.mean.trt.effect)
# Plot the estimated treatment effect (y-axis) against age (x-axis)
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min(ages), max(ages)),
     ylim = c(-20, 20),
     main = "treatment effect by age for typical female units", xlab = "age",
     ylab = "treatment effect")
for (age in ages) {
  segments(
    x0 = age,
    y0 = conf.intervals.trt.effect[1, 1 + (age-min(ages))/10],
    x1 = age,
    y1 = conf.intervals.trt.effect[2, 1 + (age-min(ages))/10],
    lwd = 1)
}
points(ages, mean.trt.effect, col = "red", pch=16)
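# Sanity check (optional): in this specification the treatment effect at a given age is
# the treatment coefficient plus the interaction coefficient times age (positions 5 and 6;
# confirm with names(coef(lm_tint))), so the simulated means should sit close to this line.
coef(lm_tint)[5] + coef(lm_tint)[6] * ages
mean.trt.effect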
##############
# QUESTION 3:
# Write your own function (8 lines max) that takes Ys and predicted Ys as inputs, and outputs R2.
# Copy/paste an example using the lalonde data (from the Matching library) that shows your code is working correctly
# (i.e., show that the summary(lm()) command outputs a very similar R2 as your own function does).
##############
# R-squared function
rsquared <- function(ytrue, ypred) {
  res <- sum((ytrue - ypred)**2)          # residual sum of squares
  tot <- sum((ytrue - mean(ytrue))**2)    # total sum of squares
  return(1 - res/tot)
}
# Apply the function to the lalonde data set
library(Matching)
data(lalonde)
lm.test = lm(age ~ educ, data = lalonde)
rsquared(lalonde$age, lm.test$fitted.values)
summary(lm.test)$r.sq
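# Additional check (optional, not part of the original answer): a multi-predictor model
# to show the function is not specific to the simple regression above; 'lm.test2' is an
# illustrative name.
lm.test2 = lm(re78 ~ treat + age + educ, data = lalonde)
rsquared(lalonde$re78, lm.test2$fitted.values)
summary(lm.test2)$r.sq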
##############
# QUESTION 4:
# Bootstrap the 95% confidence interval of the treatment effect using the mazedata1.dta
# data (define the binary treatment condition as "Caste Revealed" vs. not "Caste
# Revealed", and define the outcome as "round1"). No covariates (independent variables) are
# needed. Use 10000 bootstrap samples.
# Don't use a 'canned' bootstrap function -- please code the bootstrap routine manually.
# Then, obtain a conventional confidence interval for the treatment effect using the
# "confint(lm())" function.
##############
library(foreign)
# Load the dta file from your local folder (adjust the path to wherever mazedata1.dta is saved)
foo4 = read.dta("C:/Users/tomkr/Documents/Minerva/Work Study/Buenos Aires/mazedata1.dta")
# Create a new treatment indicator: Caste Revealed = treatment, everything else = control
new_trt = rep(0, nrow(foo4))
new_trt[which(foo4$treatment=='Caste Revealed')] = 1
foo4$trt = new_trt
# Manual bootstrap
iterations <- 10000
storage <- rep(NA, iterations)
for (i in 1:iterations) {
  # Fit a linear model of the outcome on the newly defined treatment indicator in a resampled data set
  lm.maze = lm(round1 ~ trt, data = foo4[sample(1:nrow(foo4), nrow(foo4), replace = T),])
  storage[i] <- lm.maze$coefficients[2]
}
# Bootstrap confidence interval
quantile(storage, c(0.025, 0.975))
# Conventional (analytical) confidence interval
lm.maze = lm(round1 ~ trt, data = foo4)
confint(lm.maze)[2,]
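# Equivalent formulation (sketch): with a binary treatment and no covariates, the regression
# coefficient is just the difference in group means, so the bootstrap can also be written
# without lm(). 'storage.dim' and 'boot.sample' are illustrative names; this assumes every
# resample contains at least one treated and one control unit. The CI should be very similar.
storage.dim <- rep(NA, iterations)
for (i in 1:iterations) {
  boot.sample <- foo4[sample(1:nrow(foo4), nrow(foo4), replace = TRUE), ]
  storage.dim[i] <- mean(boot.sample$round1[boot.sample$trt == 1]) -
    mean(boot.sample$round1[boot.sample$trt == 0])
}
quantile(storage.dim, c(0.025, 0.975))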
# Visualize the bootstrap and analytical CIs with a table and a histogram
# a) A table with the relevant results (bounds of the 2 confidence intervals).
data.frame("simulation" = quantile(storage, c(0.025, 0.975)), "analytical" = confint(lm.maze)[2,])
# b) 1 histogram (properly labeled) showing your bootstrap-sample results. How you do this one is up to you.
hist(storage, main = "Bootstrapped Values for the Treatment Effect",
     xlab = "Treatment Effect", ylab = "Count")
abline(v=quantile(storage, c(0.025, 0.975)), col="red")
abline(v=confint(lm.maze)[2,], col="blue")
legend(0.5, 1500, legend=c("bootstrap 95% confidence interval", "analytical 95% confidence interval"),
       col=c("red","blue"), lty=1, cex=0.5)
# c) No more than 3 sentences summarizing the results and drawing any conclusions you find relevant and interesting.
# Sample response: The coefficient of the treatment variable in the linear model estimates the treatment effect, ~ -0.132.
# The 95% confidence interval ranges from ~ -0.584 to ~ 0.319, which means we lack strong evidence about the direction
# of the treatment effect. The analytical and bootstrap approaches yielded similar CIs, which provides extra confidence in this conclusion.
##############
# QUESTION 5:
# Obtain the nsw.dta dataset from: foo <- read.csv(url("https://tinyurl.com/yx8tqf3k")).
# Randomly remove 2000 observations, and set them aside as a test set (or validation set).
# Use the remainder of the dataset to estimate 2 models of your choice of varying degrees of complexity
# (in terms of the number of variables and/or their interactions), with the dependent variable being "treat".
# NOTE: re78 is not a predictor because it postdates the treatment. (In other words, it's an outcome.)
# Use LOOCV to estimate the test-set error (MSE) for each of your models. Then compare these LOOCV test-set error
# estimates to the test-set error you obtain when you actually run your model on your 2000-row test set.
# Provide a table that summarizes the 4 different sets of numbers (2 LOOCV estimates, and 2 test set error rates).
# Explain your results in no more than a paragraph.
##############
# Load the data set and split it into training and test sets
foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
set.seed(12345)
test_set_rows <- sample(1:nrow(foo), 2000, replace = FALSE)
test_foo = foo[test_set_rows,]
train_foo = foo[-test_set_rows,]
# 2 models of varying complexity (linear probability models, i.e. gaussian glm with a binary outcome)
glm1 = glm(treat ~ age, data = train_foo)
glm2 = glm(treat ~ . -re78 + age*education + education*nodegree + married*re74 + hispanic*re75, data = train_foo)
# Calculate MSE on the held-out test set
MSE1 = mean((test_foo$treat - predict(glm1, newdata = test_foo))**2)
MSE2 = mean((test_foo$treat - predict(glm2, newdata = test_foo))**2)
# Estimate the test MSE with LOOCV (and, for comparison, 10-fold CV)
library(boot)
LOOCV1 = cv.glm(train_foo, glm1)$delta[1]
LOOCV2 = cv.glm(train_foo, glm2)$delta[1]
TENCV1 = cv.glm(train_foo, glm1, K = 10)$delta[1]
TENCV2 = cv.glm(train_foo, glm2, K = 10)$delta[1]
# Table summarizing the test set MSEs, LOOCV estimates, and 10-fold CV estimates for both models
data.frame("Test set MSE" = c(MSE1, MSE2), "LOOCV MSE" = c(LOOCV1, LOOCV2), "TENCV MSE" = c(TENCV1, TENCV2))
#                 Test set MSE    LOOCV MSE      10-fold CV MSE
# simple model    0.009836669     0.011449698    0.0114505923
# complex model   0.008975607     0.009969941    0.0099679393
# Sample response: Comparing the test set MSEs with the LOOCV estimates, we see that LOOCV approximates the test set
# MSE reasonably well (a difference of roughly 0.001-0.002). LOOCV overestimated the MSE in both cases, which may be
# acceptable in practice because it does not make a model look better than it actually is. The trade-off is that LOOCV
# is computationally expensive (it refits the model once per training observation, which takes substantial time and
# memory here). Additionally, the more complex model yields a lower MSE by both the test set and the LOOCV estimates.
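# For reference, a minimal sketch of what cv.glm(..., K = 10) computes for the simple model
# (the raw estimate, before cv.glm's bias adjustment). Fold assignment is random, so the
# result will differ slightly from TENCV1; 'folds' and 'fold_mse' are illustrative names.
folds <- sample(rep(1:10, length.out = nrow(train_foo)))
fold_mse <- sapply(1:10, function(k) {
  fit <- glm(treat ~ age, data = train_foo[folds != k, ])
  mean((train_foo$treat[folds == k] - predict(fit, newdata = train_foo[folds == k, ]))^2)
})
weighted.mean(fold_mse, w = as.numeric(table(folds)))  # should be close to TENCV1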
##############
# QUESTION 6:
# Briefly describe the purpose of the study, the methodology, and the policy implications/prescription.
# Sample response:
# This memo presents the results from the Anti-Binge Consumption (ABC) experiment, which
# tested a proposed program to reduce alcohol drinking among young adults aged 18 to 30.
# Participants were sampled at several regional community centers and colleges and were
# randomly assigned to either a series of lectures on Giant Panda conservation (control,
# n=108) or short one-on-one conversations with current and former alcohol addicts (treatment,
# n=112). Both groups reported their dietary habits, including weekly alcohol consumption,
# before and one month after the program. The results show that while pre-experiment weekly
# alcohol consumption was similar across groups, the post-experiment decrease in consumption
# was conclusive for some age groups but not others. Consumption was statistically significantly
# lower for 18-year-olds (95% intervals of 0-2 vs. 3-10 drinks for the treated and control groups,
# respectively), borderline significant for 20-year-olds, and inconclusive for ages 22-26.
# Consumption among treated participants stayed in roughly the same range (about 0-4 drinks) at all
# ages, while among control participants it declined with age, so that the differences
# diminished around ages 28-30. Thus, the results support administering the program to adults
# aged 18-20, whereas more research is necessary to understand the effects on the 22-26 age group.
##############
# QUESTION 7 (Optional):
# Figure out how to use the rgenoud genetic algorithm (you'll have to install the rgenoud library) to run a least-squares
# regression. You'll have to write a function that calculates the sum of the squared residuals, and then you will feed that
# function to rgenoud. Demonstrate that your genetic algorithm obtains the same (or very nearly the same) coefficients as
# using the lm function.
# Then, once you've established that your code is working correctly, write another function to run a least-absolute-deviations
# regression (in which the genetic algorithm identifies the linear function that produces the smallest sum of ABSOLUTE
# residuals). NOTE: You must add an argument "gradient.check = FALSE" to the genetic algorithm to ensure that it runs properly
# (You are not expected to know or explain why).
# Comment on the difference between the output (slope and intercept) of the two functions. Is one function more sensitive
# to the existing outlier?
##############
library(rgenoud)
data_outlier = read.csv(url("https://tinyurl.com/vfsoe4q"))
# Function to identify the coefficients that minimize the sum of squared residuals.
# We assume the regression of interest is: y ~ intercept + slope*x
SSR <- function(coef) {
  # genoud passes only the candidate coefficient vector, so the data are taken
  # from data_outlier directly
  y = data_outlier$y
  x = data_outlier$x
  intercept = coef[1]
  slope = coef[2]
  # for a given set of coefficients, return the sum of squared residuals
  return(sum((y - intercept - slope*x)**2))
}
# rgenoud genetic algorithm to minimize the SSR
GA_1 = genoud(SSR, nvars = 2, pop.size=1000, max.generations=100, wait.generations=10)
# Parameters at the Solution (parameter, gradient):
#
#  X[ 1] :  1.893287e-02   G[ 1] :  1.646590e-09
#  X[ 2] :  4.628064e-01   G[ 2] : -1.179106e-09
# Simple regression (stored under a name that does not mask the lm function)
lm_outlier = lm(y ~ x, data = data_outlier)
summary(lm_outlier)
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept)  0.01893    0.11321   0.167 0.867523
# x            0.46281    0.12273   3.771 0.000277 ***
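# Side-by-side comparison (sketch): the genetic-algorithm least-squares solution should
# match the lm() coefficients to several decimal places.
rbind(genoud_LS = GA_1$par, lm = coef(lm_outlier))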
SSAR <- function(coef) {
  y = data_outlier$y
  x = data_outlier$x
  intercept = coef[1]
  slope = coef[2]
  # for a given set of coefficients, return the sum of absolute residuals
  return(sum(abs(y - intercept - slope*x)))
}
GA_2 = genoud(SSAR, nvars = 2, pop.size=1000, max.generations=100, wait.generations=10, gradient.check = FALSE)
# Parameters at the Solution (parameter, gradient):
#
#  X[ 1] : -1.781644e-01   G[ 1] :  1.000000e+00
#  X[ 2] :  3.543404e-01   G[ 2] :  3.829967e-01
# Sample response: Comparing the slopes produced by the two loss functions, we observe that the least-squares (SSR)
# fit has a more positive slope. This is because the squared-error loss "punishes" large residuals more heavily than
# the absolute-error (SSAR) loss, so the least-squares fit is pulled more strongly toward the outlier; its slope and
# intercept are therefore more sensitive to the outlier's presence.
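# Optional cross-check: least-absolute-deviations regression is median (quantile) regression,
# so quantreg::rq() with tau = 0.5 should give coefficients close to the genetic-algorithm
# solution above (this assumes the quantreg package is installed; not part of the original answer).
library(quantreg)
rq(y ~ x, tau = 0.5, data = data_outlier)$coefficients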