anglilian · February 22, 2020 17:05
diff --git a/Q1_CS112_Assignment2.r b/Q1_CS112_Assignment2.r
  library(ggplot2)
 library(arm)
 ###QUESTION 1###
 ##PART A##
 #Loading data from the CSV file
 sesame <- read.csv("https://tinyurl.com/w1g163b")

 #Splitting the data into two datasets with control and treatment data
 sesame_treat <- subset(sesame, sesame$treatment == 1)
 sesame_cont <- subset(sesame, sesame$treatment == 0)

 #Creating a linear regression model for treatment and control data
 lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat)
 lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont)

 #Plotting the data for pre test and post test along with the respective regression lines
 plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" ,
     pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test)))
 lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p")
 abline(lm_pre_treat)
 abline(lm_pre_cont, lty= 'dashed')

 ##PART B##
 sesame_edit <- read.csv("https://tinyurl.com/w1g163b")

 #Change point 4 from post test to 45 from 104.6.
 sesame_edit[5, 1] = 45

 #Splitting the data into two datasets with control and treatment data
 sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1)
 sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0)

 #Creating a linear regression model for treatment and control data
 lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit)
 lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit)

 #Plotting the data for pre test and post test along with the respective regression lines
 plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" ,
    xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) )
 lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test,
       type = "p", pch = 16
       ) #changes the colour of the point to red
 abline(lm_treat)
 abline(lm_cont, lty= 'dashed')

 ##PART C##
 #Create a linear regression model for the post test score
 lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test, 
           data = sesame)

 1#simulate the coefficients of the model
 lm.4.sim <- sim(lm.4)
 lm.4.sim

 # creating an empty plot
 plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10),
     xlab = "Pre-test", ylab = "Treatment effect",
     main = "Treatment Effect in Grade 4")

 #adds a dotted line along the x-axis
 abline (0,0, lwd = .5, lty =2)

 #adds 20 lines to the plot using the simulated coefficients
 for (i in 1:20){
  curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x,
        lwd=.5, col="gray", add= TRUE)}

 #adds a line that is the difference between the two regression lines in the previous plot
 curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE)

 #histogram of pre test scores
 hist(sesame$pre.test)
diff --git a/Q2_CS112_Assignment2.r b/Q2_CS112_Assignment2.r
 ###QUESTION 2###
 tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data
 lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") + 
                I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model

 #Simulating 95% interval and mean of csoa
 set.seed(123) #setting randomness
 iteration <- 1000 #number of simulations to run 
 sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model

 agevector <- c(20,30,40,50,60,70,80)
 storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
 for (age in agevector){ #iterating through each age in age vector
  for (i in 1:1000) { #1000 times for each age
    treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated
    storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
  }
 }



 quantile(storage.matrix[,7], c(0.025,0.975))

 #Creating a table of results
 means <- apply(storage.matrix, 2, mean) #computing mean of each age
 conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
 table_ages <- rbind(means, conf.intervals) #putting data together in one table
 colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
 table_ages

 #Plotting results onto a graph
 plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)", 
     ylab = "CSOA", main= "Expected Values of Treated Females")
 for (age in agevector) {
  segments( #plots the confidence intervals of each age
    x0 = age,
    y0 = conf.intervals[1, age/10-1], 
    x1 = age,
    y1 = conf.intervals[2, age/10-1],
    lwd = 2)
 }
 lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age


 ##PART B##
 storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age

 #Simulating csoa values for control group
 for (age in agevector){ #iterating through each age in age vector
  for (i in 1:1000) { #1000 times for each age
    untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated
    storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
  }
 }

 #calculating the treatment effect for each entry by subtracting the treated from the controlled
 storage.treat <- storage.matrix - storage.matrix.cont

 #Creating a table of results
 means.treat <-apply(storage.treat, 2, mean) #computing mean of each age
 conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
 table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table
 colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
 table_ages.treat

 #Plotting results onto a graph
 plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)", 
     ylab = "CSOA", main= "Treatment Effect of Females")
 for (age in agevector) {
  segments(
    x0 = age,
    y0 = conf.intervals.treat[1, age/10-1],
    x1 = age,
    y1 = conf.intervals.treat[2, age/10-1],
    lwd = 2)
 }
 lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means
diff --git a/Q3_CS112_Assignment2.r b/Q3_CS112_Assignment2.r
 ###QUESTION 3###

 #Function for R squared that takes the two arguments: y and the predicted y
 rsquare <- function(y, yhat) {
  sumsquaredregression <- 0
  totalsumofsquares <- 0
  for (i in 1:length(y)) {
  sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression
  totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares
  }
  print (paste("The R-squared value is", 
        1-(sumsquaredregression/totalsumofsquares))) #equation for R squared
 } 

 #loading data for lalonde
 library(Matching)
 data(lalonde)

 #creating a linear model for lalonde to predict for re78
 lm.lalonde <- lm(re78 ~., data= lalonde)
 yhat <- predict(lm.lalonde)

 #Output r-squared values
 rsquare(lalonde$re78, yhat)
 summary(lm.lalonde)$r.squared
diff --git a/Q4_CS112_Assignment2.r b/Q4_CS112_Assignment2.r
 ##Question 4###
 #loading data
 library(foreign)
 maze <- read.dta("mazedata1.dta")

 #setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise
 treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0)
 maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe

 #holds all the round1 values for treatment group
 treated <- maze$round1[which(maze$treat==1)]

 #holds all the round1 values for control group
 control <- maze$round1[which(maze$treat==0)]

 #create storage list
 store <- c()

 #bootstrap the treatment effect for 10,000 samples
 for (x in 1:10000) {
  store[x] <- mean(sample(treated, length(treated), replace = TRUE)) -
              mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment
 }

 #store lower and upper boundary of confidence interval of the bootstrapped treatment effects
 confidenceinterval <- quantile(store, c(0.025,0.975))

 #linear model with round1 as dependent variable and treatment as independent variable
 lm.maze = lm(round1 ~ treat, data = maze)

 #computing confidence interval using function based on linear model
 confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table
 confint

 #create a probability density histogram of treatment effect
 hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE)
diff --git a/Q5_CS112_Assignment2.r b/Q5_CS112_Assignment2.r
 ###QUESTION 5###
 #loading data
 foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))

 #setting randomness
 set.seed(12345)

 #removing 2000 rows randomly from the original data set
 test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes
 test_foo <- foo[-test_set_rows,] #removing the rows from the data set

 #Building the model for surname A-F
 glm.simple <- glm(treat~ age, data = test_foo)
 glm.complex <- glm(treat~. -re78, data = test_foo)

 library(boot)

 ##LOOCV
 #using the cross validation function that by default sets the number of folds to 1 for LOOCV
 LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE

 LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE

 ##10-fold cross validation
 #using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs
 cv.error.simple.10 = rep(0,10) #stores each CV result
 for(i in 1:10) { #loops 10 times to attach each result
  cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV
 }
 mean(cv.error.simple.10) #computes mean for simple model

 cv.error.complex.10 = rep(0,10) #stores each CV result
 for(i in 1:10) { #loops 10 times to attach each result
  cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV
 }
 mean(cv.error.complex.10) #computes mean for complex model

 ##test set error
 #subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average

 mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model

 mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model
diff --git a/Q6_CS112_Assignment2.r b/Q6_CS112_Assignment2.r
 ###QUESTION 6
 trt = matrix(NA,nrow=2,ncol=7)
 ctrl = matrix(NA,nrow=2,ncol=7) 

 trt[,1]=c(0, 2) #18
 ctrl[,1]=c(3, 10)
 trt[,2]=c(0, 3) #20
 ctrl[,2]=c(2, 8)
 trt[,3]=c(0, 4) #22
 ctrl[,3]=c(2, 7)
 trt[,4]=c(1, 3) #24
 ctrl[,4]=c(2, 6)
 trt[,5]=c(1, 3) #26
 ctrl[,5]=c(2, 5)
 trt[,6]=c(1, 3) #28
 ctrl[,6]=c(2, 4)
 trt[,7]=c(1, 2) #30
 ctrl[,7]=c(1, 3)


 c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt
 c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl

 plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2,
     main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week")

 for (age in seq(from=18,to=30,by=2)) { 
  segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1],
           x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1)
  
  segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1],
           x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2)
 }
 legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2))
 mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10)
	library(ggplot2)
	library(arm)
	###QUESTION 1###
	##PART A##
	#Loading data from the CSV file
	sesame <- read.csv("https://tinyurl.com/w1g163b")

	#Splitting the data into two datasets with control and treatment data
	sesame_treat <- subset(sesame, sesame$treatment == 1)
	sesame_cont <- subset(sesame, sesame$treatment == 0)

	#Creating a linear regression model for treatment and control data
	lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat)
	lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont)

	#Plotting the data for pre test and post test along with the respective regression lines
	plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" ,
	pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test)))
	lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p")
	abline(lm_pre_treat)
	abline(lm_pre_cont, lty= 'dashed')

	##PART B##
	sesame_edit <- read.csv("https://tinyurl.com/w1g163b")

	#Change point 4 from post test to 45 from 104.6.
	sesame_edit[5, 1] = 45

	#Splitting the data into two datasets with control and treatment data
	sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1)
	sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0)

	#Creating a linear regression model for treatment and control data
	lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit)
	lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit)

	#Plotting the data for pre test and post test along with the respective regression lines
	plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" ,
	xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) )
	lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test,
	type = "p", pch = 16
	) #changes the colour of the point to red
	abline(lm_treat)
	abline(lm_cont, lty= 'dashed')

	##PART C##
	#Create a linear regression model for the post test score
	lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test,
	data = sesame)

	1#simulate the coefficients of the model
	lm.4.sim <- sim(lm.4)
	lm.4.sim

	# creating an empty plot
	plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10),
	xlab = "Pre-test", ylab = "Treatment effect",
	main = "Treatment Effect in Grade 4")

	#adds a dotted line along the x-axis
	abline (0,0, lwd = .5, lty =2)

	#adds 20 lines to the plot using the simulated coefficients
	for (i in 1:20){
	curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x,
	lwd=.5, col="gray", add= TRUE)}

	#adds a line that is the difference between the two regression lines in the previous plot
	curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE)

	#histogram of pre test scores
	hist(sesame$pre.test)
	###QUESTION 2###
	tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data
	lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") +
	I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model

	#Simulating 95% interval and mean of csoa
	set.seed(123) #setting randomness
	iteration <- 1000 #number of simulations to run
	sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model

	agevector <- c(20,30,40,50,60,70,80)
	storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age
	for (age in agevector){ #iterating through each age in age vector
	for (i in 1:1000) { #1000 times for each age
	treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated
	storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
	}
	}



	quantile(storage.matrix[,7], c(0.025,0.975))

	#Creating a table of results
	means <- apply(storage.matrix, 2, mean) #computing mean of each age
	conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
	table_ages <- rbind(means, conf.intervals) #putting data together in one table
	colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
	table_ages

	#Plotting results onto a graph
	plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)",
	ylab = "CSOA", main= "Expected Values of Treated Females")
	for (age in agevector) {
	segments( #plots the confidence intervals of each age
	x0 = age,
	y0 = conf.intervals[1, age/10-1],
	x1 = age,
	y1 = conf.intervals[2, age/10-1],
	lwd = 2)
	}
	lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age


	##PART B##
	storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age

	#Simulating csoa values for control group
	for (age in agevector){ #iterating through each age in age vector
	for (i in 1:1000) { #1000 times for each age
	untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated
	storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa
	}
	}

	#calculating the treatment effect for each entry by subtracting the treated from the controlled
	storage.treat <- storage.matrix - storage.matrix.cont

	#Creating a table of results
	means.treat <-apply(storage.treat, 2, mean) #computing mean of each age
	conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age
	table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table
	colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age
	table_ages.treat

	#Plotting results onto a graph
	plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)",
	ylab = "CSOA", main= "Treatment Effect of Females")
	for (age in agevector) {
	segments(
	x0 = age,
	y0 = conf.intervals.treat[1, age/10-1],
	x1 = age,
	y1 = conf.intervals.treat[2, age/10-1],
	lwd = 2)
	}
	lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means
	###QUESTION 3###

	#Function for R squared that takes the two arguments: y and the predicted y
	rsquare <- function(y, yhat) {
	sumsquaredregression <- 0
	totalsumofsquares <- 0
	for (i in 1:length(y)) {
	sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression
	totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares
	}
	print (paste("The R-squared value is",
	1-(sumsquaredregression/totalsumofsquares))) #equation for R squared
	}

	#loading data for lalonde
	library(Matching)
	data(lalonde)

	#creating a linear model for lalonde to predict for re78
	lm.lalonde <- lm(re78 ~., data= lalonde)
	yhat <- predict(lm.lalonde)

	#Output r-squared values
	rsquare(lalonde$re78, yhat)
	summary(lm.lalonde)$r.squared
	##Question 4###
	#loading data
	library(foreign)
	maze <- read.dta("mazedata1.dta")

	#setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise
	treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0)
	maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe

	#holds all the round1 values for treatment group
	treated <- maze$round1[which(maze$treat==1)]

	#holds all the round1 values for control group
	control <- maze$round1[which(maze$treat==0)]

	#create storage list
	store <- c()

	#bootstrap the treatment effect for 10,000 samples
	for (x in 1:10000) {
	store[x] <- mean(sample(treated, length(treated), replace = TRUE)) -
	mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment
	}

	#store lower and upper boundary of confidence interval of the bootstrapped treatment effects
	confidenceinterval <- quantile(store, c(0.025,0.975))

	#linear model with round1 as dependent variable and treatment as independent variable
	lm.maze = lm(round1 ~ treat, data = maze)

	#computing confidence interval using function based on linear model
	confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table
	confint

	#create a probability density histogram of treatment effect
	hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE)
	###QUESTION 5###
	#loading data
	foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))

	#setting randomness
	set.seed(12345)

	#removing 2000 rows randomly from the original data set
	test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes
	test_foo <- foo[-test_set_rows,] #removing the rows from the data set

	#Building the model for surname A-F
	glm.simple <- glm(treat~ age, data = test_foo)
	glm.complex <- glm(treat~. -re78, data = test_foo)

	library(boot)

	##LOOCV
	#using the cross validation function that by default sets the number of folds to 1 for LOOCV
	LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE

	LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE

	##10-fold cross validation
	#using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs
	cv.error.simple.10 = rep(0,10) #stores each CV result
	for(i in 1:10) { #loops 10 times to attach each result
	cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV
	}
	mean(cv.error.simple.10) #computes mean for simple model

	cv.error.complex.10 = rep(0,10) #stores each CV result
	for(i in 1:10) { #loops 10 times to attach each result
	cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV
	}
	mean(cv.error.complex.10) #computes mean for complex model

	##test set error
	#subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average

	mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model

	mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model
	###QUESTION 6
	trt = matrix(NA,nrow=2,ncol=7)
	ctrl = matrix(NA,nrow=2,ncol=7)

	trt[,1]=c(0, 2) #18
	ctrl[,1]=c(3, 10)
	trt[,2]=c(0, 3) #20
	ctrl[,2]=c(2, 8)
	trt[,3]=c(0, 4) #22
	ctrl[,3]=c(2, 7)
	trt[,4]=c(1, 3) #24
	ctrl[,4]=c(2, 6)
	trt[,5]=c(1, 3) #26
	ctrl[,5]=c(2, 5)
	trt[,6]=c(1, 3) #28
	ctrl[,6]=c(2, 4)
	trt[,7]=c(1, 2) #30
	ctrl[,7]=c(1, 3)


	c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt
	c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl

	plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2,
	main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week")

	for (age in seq(from=18,to=30,by=2)) {
	segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1],
	x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1)

	segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1],
	x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2)
	}
	legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2))
	mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10)