# CS112 - Assignment 2 Solutions (Spring 2019)
# Forked from tomkreker/CS112-Assignment 2 Solutions (Spring 2019).R
##############
# QUESTION 1:
# a) Replicating figure 9.7 (right panel) from Gelman et al.
# Separate treatment and control data
foo = read.csv("https://tinyurl.com/wlgl63b")
ctrl_units = foo[which(foo$treatment==0),]
trt_units = foo[which(foo$treatment==1),]
# Fit regression models for both groups
lm_ctrl = lm(post.test ~ pre.test, data=ctrl_units)
lm_trt = lm(post.test ~ pre.test, data=trt_units)
# Plot the data and the regression lines
plot(x=c(0:125), y=c(0:125), type='n', xlab='Pre-test', ylab='Post-test') # following the original figure's layout (all points in the corner)
#plot(x=c(70:125), y=c(70:125), type='n', xlab='Pre-test', ylab='Post-test') # a clearer version
points(ctrl_units$pre.test, ctrl_units$post.test, pch=16)
points(trt_units$pre.test, trt_units$post.test, pch=1)
legend(x=0, y=125, legend=c('Treatment','Control'), lty=1:2)
title('Grade 4 post-test vs. pre-test scores')
abline(lm_trt)
abline(lm_ctrl, lty=2)
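# Optional cross-check (not part of the original figure): the same two lines can be
# recovered from a single model with a treatment interaction. The point estimates
# should match the separate fits above exactly; 'lm_both' is just an illustrative name.
lm_both <- lm(post.test ~ pre.test * treatment, data = foo)
coef(lm_ctrl)  # control line: intercept and slope
c(coef(lm_both)["(Intercept)"], coef(lm_both)["pre.test"])
coef(lm_trt)   # treatment line: intercept and slope
c(coef(lm_both)["(Intercept)"] + coef(lm_both)["treatment"],
  coef(lm_both)["pre.test"] + coef(lm_both)["pre.test:treatment"])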
# b) Changing the value of one datum to change the treatment effect
trt_units[11,'post.test'] = 70
lm_trt_b = lm(post.test ~ pre.test, data=trt_units)
# Plot the data and the regression lines
#plot(x=c(0:125), y=c(0:125), type='n', xlab='Pre-test', ylab='Post-test') # following the original figure's layout (all points in the corner)
plot(x=c(70:125), y=c(70:125), type='n', xlab='Pre-test', ylab='Post-test') # a clearer version
points(ctrl_units$pre.test, ctrl_units$post.test, pch=16)
points(trt_units$pre.test, trt_units$post.test, pch=1)
points(104.1, 70, col='red', pch=0, cex=2) # highlight the modified point
title('Grade 4 post-test vs. pre-test scores - one observation changed')
legend(x=70, y=125, legend=c('Treatment','Control'), lty=1:2)
abline(lm_trt_b)
abline(lm_ctrl, lty=2)
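# Quick check (not required by the question): compare the treatment-group slope before
# and after changing the single observation, to see how much leverage that point has.
coef(lm_trt)["pre.test"]    # original treatment-group slope
coef(lm_trt_b)["pre.test"]  # slope after setting one post-test score to 70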
# c) Replicate figure 9.8
library(arm)
lm_c = lm(post.test ~ treatment + pre.test + treatment:pre.test, data=foo)
lm_c_sim = sim(lm_c, n.sims=20)
plot(0, 0, xlim=range(foo$pre.test), ylim=c(-5,15),
     xlab="Pre-test", ylab="Treatment effect",
     main="Treatment effect for Grade 4")
abline(0, 0, lwd=.5, lty=2)
for (i in 1:20){
  # the treatment effect at a given pre-test score is the treatment coefficient
  # plus the interaction coefficient times the score (curve() supplies x itself)
  curve(lm_c_sim@coef[i,2] + lm_c_sim@coef[i,4]*x, lwd=.5, col="gray",
        add=TRUE)}
curve(coef(lm_c)[2] + coef(lm_c)[4]*x, lwd=.5, add=TRUE)
legend(x=105, y=15, legend=c('Main regression','Simulated results'), lty=1, col=c('black','grey'))
# 95% confidence interval for the treatment effect
pre_test = c(78:120)
num_sims = 100
storage_matrix = matrix(NA, nrow=length(pre_test), ncol=num_sims)
lm_c_sim = sim(lm_c, n.sims=num_sims)
for (score in pre_test){
  for (i in 1:num_sims){
    storage_matrix[score-77, i] = lm_c_sim@coef[i,2] + lm_c_sim@coef[i,4]*score
  }
}
conf.int <- apply(storage_matrix, 1, quantile, probs = c(0.025, 0.975))
lines(pre_test, conf.int[1,], lwd=0.5, col='red')
lines(pre_test, conf.int[2,], lwd=0.5, col='red')
legend(x=105, y=15, legend=c('Main regression','Simulated results', '95% CI'), lty=1, col=c('black','grey', 'red'))
# The uncertainty around the treatment effect is lowest for pre-test values of roughly
# 100-110. This is where most observations lie, so most simulated regression lines
# are pulled toward the data in this region, reducing the variation.
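# Equivalent, loop-free construction of the same simulated treatment effects
# (a sketch only; 'te_draws' and 'conf.int.vec' are illustrative names). Each
# column of te_draws is one simulation, each row one pre-test score.
te_draws <- cbind(1, pre_test) %*% t(lm_c_sim@coef[, c(2, 4)])
conf.int.vec <- apply(te_draws, 1, quantile, probs = c(0.025, 0.975))
all.equal(unname(conf.int), unname(conf.int.vec))  # should be TRUE (up to numerical tolerance)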
##############
# QUESTION 2:
# Using the "tinting" data set in the "DAAG" package, fit a linear model that predicts csoa as a function of age, sex, target,
# I(tint != "no"), and I((tint != "no")*age). Estimate:
##############
library(arm)
tinting = read.csv(url("https://tinyurl.com/v4bq99k"))
# a) the 95% interval of expected values for csoa, for typical treated female units, with target == "hicon", ages 20, 30, 40,
# 50, 60, 70, and 80, using simulation (i.e., 1000 simulated predictions for every row from 1000 sets of coefficients).
# In this data set, a "treated unit" is a unit for which tint != "no" is TRUE.
# Fit the regression model
lm_tint <-
  lm(csoa ~ age + sex + target + I(as.numeric(tint != "no")) + I(as.numeric(tint != "no")*age), data = tinting)
summary(lm_tint)
# Simulate coefficient draws (stored under a name that does not mask arm::sim)
iterations <- 1000
sim_tint <- sim(lm_tint, n.sims = iterations)
# Predict expected csoa for every age of interest
ages = c(20, 30, 40, 50, 60, 70, 80)
simulated.trt <- matrix(NA, nrow = iterations, ncol = length(ages))
for (age in ages) {
  # Covariate vector: intercept, age, sex dummy (0 = female), target dummy,
  # treatment dummy (1 = tint != "no"), treatment-by-age interaction.
  # Check names(coef(lm_tint)) to confirm which target level the 4th position encodes.
  Xs <- c(1, age, 0, 1, 1, 1*age)
  for (i in 1:iterations) {
    simulated.trt[i, 1 + (age-min(ages))/10] <- sum(Xs*sim_tint@coef[i,])
  }
}
# Table with the relevant point estimates (bounds of the 95% intervals of expected csoa for each age)
conf.intervals.trt <- apply(simulated.trt, 2, quantile, probs = c(0.025, 0.975))
mean.trt <- apply(simulated.trt, 2, mean)
table.mean.trt <- t(data.frame(conf.intervals.trt))
colnames(table.mean.trt) <- c("Mean CI Lower Bound", "Mean CI Upper Bound")
rownames(table.mean.trt) <- ages
View(table.mean.trt)
# Plot the predicted csoa (y-axis) against age (x-axis)
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min(ages), max(ages)),
     ylim = c(0, 100),
     main = "csoa by age for typical treated female units", xlab = "age",
     ylab = "csoa")
for (age in ages) {
  segments(
    x0 = age,
    y0 = conf.intervals.trt[1, 1 + (age-min(ages))/10],
    x1 = age,
    y1 = conf.intervals.trt[2, 1 + (age-min(ages))/10],
    lwd = 1)
}
points(ages, mean.trt, col = "red", pch=16)
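# Optional cross-check of the simulation intervals against the analytical confidence
# intervals from predict(). This assumes tint has some level other than "no" (in the
# DAAG tinting data it is coded no/lo/hi); any non-"no" value marks a treated unit
# because only I(tint != "no") enters the model. 'newdata.trt' is an illustrative name.
# These intervals should be close to the simulation intervals above, provided the Xs
# vector encodes the same covariate profile.
newdata.trt <- data.frame(age = ages, sex = "f", target = "hicon", tint = "lo")
predict(lm_tint, newdata = newdata.trt, interval = "confidence", level = 0.95)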
# b) the 95% prediction interval for the treatment effect, for typical female units. Please follow the instructions in (a),
# using simulation.
# Predict expected csoa for every age of interest, now for untreated (control) units
ages = c(20, 30, 40, 50, 60, 70, 80)
simulated.ctrl <- matrix(NA, nrow = iterations, ncol = length(ages))
for (age in ages) {
  # Same covariate vector as in (a), but with the treatment dummy and the
  # treatment-by-age interaction both set to 0 (control unit).
  Xs <- c(1, age, 0, 1, 0, 0*age)
  for (i in 1:iterations) {
    simulated.ctrl[i, 1 + (age-min(ages))/10] <- sum(Xs*sim_tint@coef[i,])
  }
}
simulated.trt.effect = simulated.trt - simulated.ctrl
# Table with the relevant point estimates (bounds of the 95% intervals of the treatment effect for each age)
conf.intervals.trt.effect <- apply(simulated.trt.effect, 2, quantile, probs = c(0.025, 0.975))
mean.trt.effect <- apply(simulated.trt.effect, 2, mean)
table.mean.trt.effect <- t(data.frame(conf.intervals.trt.effect))
colnames(table.mean.trt.effect) <- c("Mean CI Lower Bound", "Mean CI Upper Bound")
rownames(table.mean.trt.effect) <- ages
View(table.mean.trt.effect)
# Plot the estimated treatment effect (y-axis) against age (x-axis)
plot(x = c(1:100), y = c(1:100), type = "n",
     xlim = c(min(ages), max(ages)),
     ylim = c(-20, 20),
     main = "treatment effect by age for typical female units", xlab = "age",
     ylab = "treatment effect")
for (age in ages) {
  segments(
    x0 = age,
    y0 = conf.intervals.trt.effect[1, 1 + (age-min(ages))/10],
    x1 = age,
    y1 = conf.intervals.trt.effect[2, 1 + (age-min(ages))/10],
    lwd = 1)
}
points(ages, mean.trt.effect, col = "red", pch=16)
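# Sanity check (optional): in this specification the treatment effect at a given age is
# the treatment coefficient plus the interaction coefficient times age (positions 5 and 6;
# confirm with names(coef(lm_tint))), so the simulated means should sit close to this line.
coef(lm_tint)[5] + coef(lm_tint)[6] * ages
mean.trt.effect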
##############
# QUESTION 3:
# Write your own function (8 lines max) that takes Ys and predicted Ys as inputs, and outputs R2.
# Copy/paste an example using the lalonde data (from the Matching library) that shows your code is working correctly
# (i.e., show that the summary(lm()) command outputs a very similar R2 as your own function does).
##############
# R-squared function
rsquared <- function(ytrue, ypred) {
  res <- sum((ytrue - ypred)**2)          # residual sum of squares
  tot <- sum((ytrue - mean(ytrue))**2)    # total sum of squares
  return(1 - res/tot)
}
# Apply the function to the lalonde data set
library(Matching)
data(lalonde)
lm.test = lm(age ~ educ, data = lalonde)
rsquared(lalonde$age, lm.test$fitted.values)
summary(lm.test)$r.sq
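# Additional check (optional, not part of the original answer): a multi-predictor model
# to show the function is not specific to the simple regression above; 'lm.test2' is an
# illustrative name.
lm.test2 = lm(re78 ~ treat + age + educ, data = lalonde)
rsquared(lalonde$re78, lm.test2$fitted.values)
summary(lm.test2)$r.sq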
##############
# QUESTION 4:
# Bootstrap the 95% confidence interval of the treatment effect using the mazedata1.dta
# data (define the binary treatment condition as "Caste Revealed" vs. not "Caste
# Revealed", and define the outcome as "round1"). No covariates (independent variables) are
# needed. Use 10000 bootstrap samples.
# Don't use a 'canned' bootstrap function -- please code the bootstrap routine manually.
# Then, obtain a conventional confidence interval for the treatment effect using the
# "confint(lm())" function.
##############
library(foreign)
# Load the dta file from your local folder (adjust the path to wherever mazedata1.dta is saved)
foo4 = read.dta("C:/Users/tomkr/Documents/Minerva/Work Study/Buenos Aires/mazedata1.dta")
# Create a new treatment indicator: Caste Revealed = treatment, everything else = control
new_trt = rep(0, nrow(foo4))
new_trt[which(foo4$treatment=='Caste Revealed')] = 1
foo4$trt = new_trt
# Manual bootstrap
iterations <- 10000
storage <- rep(NA, iterations)
for (i in 1:iterations) {
  # Fit a linear model of the outcome on the newly defined treatment indicator in a resampled data set
  lm.maze = lm(round1 ~ trt, data = foo4[sample(1:nrow(foo4), nrow(foo4), replace = T),])
  storage[i] <- lm.maze$coefficients[2]
}
# Bootstrap confidence interval
quantile(storage, c(0.025, 0.975))
# Conventional (analytical) confidence interval
lm.maze = lm(round1 ~ trt, data = foo4)
confint(lm.maze)[2,]
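# Equivalent formulation (sketch): with a binary treatment and no covariates, the regression
# coefficient is just the difference in group means, so the bootstrap can also be written
# without lm(). 'storage.dim' and 'boot.sample' are illustrative names; this assumes every
# resample contains at least one treated and one control unit. The CI should be very similar.
storage.dim <- rep(NA, iterations)
for (i in 1:iterations) {
  boot.sample <- foo4[sample(1:nrow(foo4), nrow(foo4), replace = TRUE), ]
  storage.dim[i] <- mean(boot.sample$round1[boot.sample$trt == 1]) -
    mean(boot.sample$round1[boot.sample$trt == 0])
}
quantile(storage.dim, c(0.025, 0.975))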
# Visualize the bootstrap and analytical CIs with a table and a histogram
# a) A table with the relevant results (bounds of the 2 confidence intervals).
data.frame("simulation" = quantile(storage, c(0.025, 0.975)), "analytical" = confint(lm.maze)[2,])
# b) 1 histogram (properly labeled) showing your bootstrap-sample results. How you do this one is up to you.
hist(storage, main = "Bootstrapped Values for the Treatment Effect",
     xlab = "Treatment Effect", ylab = "Count")
abline(v=quantile(storage, c(0.025, 0.975)), col="red")
abline(v=confint(lm.maze)[2,], col="blue")
legend(0.5, 1500, legend=c("bootstrap 95% confidence interval", "analytical 95% confidence interval"),
       col=c("red","blue"), lty=1, cex=0.5)
# c) No more than 3 sentences summarizing the results and drawing any conclusions you find relevant and interesting.
# Sample response: The coefficient of the treatment variable in the linear model estimates the treatment effect, ~ -0.132.
# The 95% confidence interval ranges from ~ -0.584 to ~ 0.319, which means we lack strong evidence about the direction
# of the treatment effect. The analytical and bootstrap approaches yielded similar CIs, which provides extra confidence in this conclusion.
##############
# QUESTION 5:
# Obtain the nsw.dta dataset from: foo <- read.csv(url("https://tinyurl.com/yx8tqf3k")).
# Randomly remove 2000 observations, and set them aside as a test set (or validation set).
# Use the remainder of the dataset to estimate 2 models of your choice of varying degrees of complexity
# (in terms of the number of variables and/or their interactions), with the dependent variable being "treat".
# NOTE: re78 is not a predictor because it postdates the treatment. (In other words, it's an outcome.)
# Use LOOCV to estimate the test-set error (MSE) for each of your models. Then compare these LOOCV test-set error
# estimates to the test-set error you obtain when you actually run your model on your 2000-row test set.
# Provide a table that summarizes the 4 different sets of numbers (2 LOOCV estimates, and 2 test set error rates).
# Explain your results in no more than a paragraph.
##############
# Load the data set and split it into training and test sets
foo <- read.csv(url("https://tinyurl.com/yx8tqf3k"))
set.seed(12345)
test_set_rows <- sample(1:nrow(foo), 2000, replace = FALSE)
test_foo = foo[test_set_rows,]
train_foo = foo[-test_set_rows,]
# 2 models of varying complexity (linear probability models, i.e. gaussian glm with a binary outcome)
glm1 = glm(treat ~ age, data = train_foo)
glm2 = glm(treat ~ . -re78 + age*education + education*nodegree + married*re74 + hispanic*re75, data = train_foo)
# Calculate MSE on the held-out test set
MSE1 = mean((test_foo$treat - predict(glm1, newdata = test_foo))**2)
MSE2 = mean((test_foo$treat - predict(glm2, newdata = test_foo))**2)
# Estimate the test MSE with LOOCV (and, for comparison, 10-fold CV)
library(boot)
LOOCV1 = cv.glm(train_foo, glm1)$delta[1]
LOOCV2 = cv.glm(train_foo, glm2)$delta[1]
TENCV1 = cv.glm(train_foo, glm1, K = 10)$delta[1]
TENCV2 = cv.glm(train_foo, glm2, K = 10)$delta[1]
# Table summarizing the test set MSEs, LOOCV estimates, and 10-fold CV estimates for both models
data.frame("Test set MSE" = c(MSE1, MSE2), "LOOCV MSE" = c(LOOCV1, LOOCV2), "TENCV MSE" = c(TENCV1, TENCV2))
#                 Test set MSE    LOOCV MSE      10-fold CV MSE
# simple model    0.009836669     0.011449698    0.0114505923
# complex model   0.008975607     0.009969941    0.0099679393
# Sample response: Comparing the test set MSEs with the LOOCV estimates, we see that LOOCV approximates the test set
# MSE reasonably well (a difference of roughly 0.001-0.002). LOOCV overestimated the MSE in both cases, which may be
# acceptable in practice because it does not make a model look better than it actually is. The trade-off is that LOOCV
# is computationally expensive (it refits the model once per training observation, which takes substantial time and
# memory here). Additionally, the more complex model yields a lower MSE by both the test set and the LOOCV estimates.
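# For reference, a minimal sketch of what cv.glm(..., K = 10) computes for the simple model
# (the raw estimate, before cv.glm's bias adjustment). Fold assignment is random, so the
# result will differ slightly from TENCV1; 'folds' and 'fold_mse' are illustrative names.
folds <- sample(rep(1:10, length.out = nrow(train_foo)))
fold_mse <- sapply(1:10, function(k) {
  fit <- glm(treat ~ age, data = train_foo[folds != k, ])
  mean((train_foo$treat[folds == k] - predict(fit, newdata = train_foo[folds == k, ]))^2)
})
weighted.mean(fold_mse, w = as.numeric(table(folds)))  # should be close to TENCV1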
##############
# QUESTION 6:
# Briefly describe the purpose of the study, the methodology, and the policy implications/prescription.
# Sample response:
# This memo presents the results from the Anti-Binge Consumption (ABC) experiment, which
# tested a proposed program to reduce alcohol drinking among young adults aged 18 to 30.
# Participants were sampled at several regional community centers and colleges and were
# randomly assigned to either a series of lectures on Giant Panda conservation (control,
# n=108) or short one-on-one conversations with current and former alcohol addicts (treatment,
# n=112). Both groups reported their dietary habits, including weekly alcohol consumption,
# before and one month after the program. The results show that while pre-experiment weekly
# alcohol consumption was similar across groups, the post-experiment decrease in consumption
# was conclusive for some age groups but not others. Consumption was statistically significantly
# lower for 18-year-olds (95% intervals of 0-2 vs. 3-10 drinks for the treated and control groups,
# respectively), borderline significant for 20-year-olds, and inconclusive for ages 22-26.
# Consumption among treated participants stayed in roughly the same range (about 0-4 drinks) at all
# ages, while among control participants it declined with age, so that the differences
# diminished around ages 28-30. Thus, the results support administering the program to adults
# aged 18-20, whereas more research is necessary to understand the effects on the 22-26 age group.
##############
# QUESTION 7 (Optional):
# Figure out how to use the rgenoud genetic algorithm (you'll have to install the rgenoud library) to run a least-squares
# regression. You'll have to write a function that calculates the sum of the squared residuals, and then you will feed that
# function to rgenoud. Demonstrate that your genetic algorithm obtains the same (or very nearly the same) coefficients as
# using the lm function.
# Then, once you've established that your code is working correctly, write another function to run a least-absolute-deviations
# regression (in which the genetic algorithm identifies the linear function that produces the smallest sum of ABSOLUTE
# residuals). NOTE: You must add an argument "gradient.check = FALSE" to the genetic algorithm to ensure that it runs properly
# (You are not expected to know or explain why).
# Comment on the difference between the output (slope and intercept) of the two functions. Is one function more sensitive
# to the existing outlier?
##############
library(rgenoud)
data_outlier = read.csv(url("https://tinyurl.com/vfsoe4q"))
# Function to identify the coefficients that minimize the sum of squared residuals.
# We assume the regression of interest is: y ~ intercept + slope*x
SSR <- function(coef) {
  # genoud passes only the candidate coefficient vector, so the data are taken
  # from data_outlier directly
  y = data_outlier$y
  x = data_outlier$x
  intercept = coef[1]
  slope = coef[2]
  # for a given set of coefficients, return the sum of squared residuals
  return(sum((y - intercept - slope*x)**2))
}
# rgenoud genetic algorithm to minimize the SSR
GA_1 = genoud(SSR, nvars = 2, pop.size=1000, max.generations=100, wait.generations=10)
# Parameters at the Solution (parameter, gradient):
#
#  X[ 1] :  1.893287e-02   G[ 1] :  1.646590e-09
#  X[ 2] :  4.628064e-01   G[ 2] : -1.179106e-09
# Simple regression (stored under a name that does not mask the lm function)
lm_outlier = lm(y ~ x, data = data_outlier)
summary(lm_outlier)
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept)  0.01893    0.11321   0.167 0.867523
# x            0.46281    0.12273   3.771 0.000277 ***
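# Side-by-side comparison (sketch): the genetic-algorithm least-squares solution should
# match the lm() coefficients to several decimal places.
rbind(genoud_LS = GA_1$par, lm = coef(lm_outlier))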
SSAR <- function(coef) {
  y = data_outlier$y
  x = data_outlier$x
  intercept = coef[1]
  slope = coef[2]
  # for a given set of coefficients, return the sum of absolute residuals
  return(sum(abs(y - intercept - slope*x)))
}
GA_2 = genoud(SSAR, nvars = 2, pop.size=1000, max.generations=100, wait.generations=10, gradient.check = FALSE)
# Parameters at the Solution (parameter, gradient):
#
#  X[ 1] : -1.781644e-01   G[ 1] :  1.000000e+00
#  X[ 2] :  3.543404e-01   G[ 2] :  3.829967e-01
# Sample response: Comparing the slopes produced by the two loss functions, we observe that the least-squares (SSR)
# fit has a more positive slope. This is because the squared-error loss "punishes" large residuals more heavily than
# the absolute-error (SSAR) loss, so the least-squares fit is pulled more strongly toward the outlier; its slope and
# intercept are therefore more sensitive to the outlier's presence.
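# Optional cross-check: least-absolute-deviations regression is median (quantile) regression,
# so quantreg::rq() with tau = 0.5 should give coefficients close to the genetic-algorithm
# solution above (this assumes the quantreg package is installed; not part of the original answer).
library(quantreg)
rq(y ~ x, tau = 0.5, data = data_outlier)$coefficients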