Last active
February 22, 2020 17:05
-
-
Save anglilian/ed11b3cfd73a31af8fef52d5b7ed7f60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(arm) | |
###QUESTION 1### | |
##PART A## | |
#Loading data from the CSV file | |
sesame <- read.csv("https://tinyurl.com/w1g163b") | |
#Splitting the data into two datasets with control and treatment data | |
sesame_treat <- subset(sesame, sesame$treatment == 1) | |
sesame_cont <- subset(sesame, sesame$treatment == 0) | |
#Creating a linear regression model for treatment and control data | |
lm_pre_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat) | |
lm_pre_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont) | |
#Plotting the data for pre test and post test along with the respective regression lines | |
plot(x = sesame_cont$pre.test, y = sesame_cont$post.test, main = "Grade 4" , | |
pch = 16, xlab = "Pre test", ylab = "Post test", xlim= c(0, max(sesame$pre.test)), ylim = c(0,max(sesame$post.test))) | |
lines (x = sesame_treat$pre.test, y=sesame_treat$post.test, type = "p") | |
abline(lm_pre_treat) | |
abline(lm_pre_cont, lty= 'dashed') | |
##PART B## | |
sesame_edit <- read.csv("https://tinyurl.com/w1g163b") | |
#Change point 4 from post test to 45 from 104.6. | |
sesame_edit[5, 1] = 45 | |
#Splitting the data into two datasets with control and treatment data | |
sesame_treat_edit <- subset(sesame_edit, sesame_edit$treatment == 1) | |
sesame_cont_edit <- subset(sesame_edit, sesame_edit$treatment == 0) | |
#Creating a linear regression model for treatment and control data | |
lm_treat <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data = sesame_treat_edit) | |
lm_cont <- lm(post.test ~ pre.test + treatment:pre.test + treatment , data=sesame_cont_edit) | |
#Plotting the data for pre test and post test along with the respective regression lines | |
plot(x = sesame_treat_edit$pre.test, y=sesame_treat_edit$post.test, main = "Grade 4" , | |
xlab = "Pre test", ylab = "Post test", col=ifelse(sesame_edit$post.test==45.0, "red", "black"), pch=ifelse(sesame_edit$post.test==45.0,16, 1) ) | |
lines (x = sesame_cont_edit$pre.test, y = sesame_cont_edit$post.test, | |
type = "p", pch = 16 | |
) #changes the colour of the point to red | |
abline(lm_treat) | |
abline(lm_cont, lty= 'dashed') | |
##PART C## | |
#Create a linear regression model for the post test score | |
lm.4 <- lm(post.test ~ treatment + pre.test + treatment:pre.test, | |
data = sesame) | |
1#simulate the coefficients of the model | |
lm.4.sim <- sim(lm.4) | |
lm.4.sim | |
# creating an empty plot | |
plot(0, 0, xlim= range(sesame$pre.test), ylim = c(-5,10), | |
xlab = "Pre-test", ylab = "Treatment effect", | |
main = "Treatment Effect in Grade 4") | |
#adds a dotted line along the x-axis | |
abline (0,0, lwd = .5, lty =2) | |
#adds 20 lines to the plot using the simulated coefficients | |
for (i in 1:20){ | |
curve(lm.4.sim@coef[i,2]+lm.4.sim@coef[i,4]*x, | |
lwd=.5, col="gray", add= TRUE)} | |
#adds a line that is the difference between the two regression lines in the previous plot | |
curve(coef(lm.4)[2]+coef(lm.4)[4]*x, lwd=.5, add=TRUE) | |
#histogram of pre test scores | |
hist(sesame$pre.test) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###QUESTION 2### | |
tinting = read.csv(url("https://tinyurl.com/v4bq99k")) #loading data | |
lm.tint <- lm(csoa~ age + sex + target + I(tint!="no") + | |
I(as.numeric(tint!="no")*age), data = tinting) #setting the linear model | |
#Simulating 95% interval and mean of csoa | |
set.seed(123) #setting randomness | |
iteration <- 1000 #number of simulations to run | |
sim_treated <-sim(lm.tint, n.sims=iteration) #running the simulations 1000 times on linear model | |
agevector <- c(20,30,40,50,60,70,80) | |
storage.matrix <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age | |
for (age in agevector){ #iterating through each age in age vector | |
for (i in 1:1000) { #1000 times for each age | |
treated <- c(1, age, 0, 0, TRUE, as.numeric(TRUE)*age) #setting the variables as stipulated | |
storage.matrix[i, age/10-1] <-sum(treated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa | |
} | |
} | |
quantile(storage.matrix[,7], c(0.025,0.975)) | |
#Creating a table of results | |
means <- apply(storage.matrix, 2, mean) #computing mean of each age | |
conf.intervals <- apply(storage.matrix, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age | |
table_ages <- rbind(means, conf.intervals) #putting data together in one table | |
colnames(table_ages) <- c(20,30,40,50,60,70,80) #change column names to appropriate age | |
table_ages | |
#Plotting results onto a graph | |
plot(x = c(1:100), xlim= c(20,80) , ylim = c(30,70), type = "n", xlab = "Age (Years)", | |
ylab = "CSOA", main= "Expected Values of Treated Females") | |
for (age in agevector) { | |
segments( #plots the confidence intervals of each age | |
x0 = age, | |
y0 = conf.intervals[1, age/10-1], | |
x1 = age, | |
y1 = conf.intervals[2, age/10-1], | |
lwd = 2) | |
} | |
lines(agevector, means, pch=16, col='red', type='p') #plot the means of each age | |
##PART B## | |
storage.matrix.cont <- matrix(NA, nrow=1000, ncol = 7) #creating a matrix to store the 1000 csoa for each age | |
#Simulating csoa values for control group | |
for (age in agevector){ #iterating through each age in age vector | |
for (i in 1:1000) { #1000 times for each age | |
untreated <- c(1, age, 0, 0, 0 , as.numeric(FALSE)*age) #setting the variables as stipulated | |
storage.matrix.cont[i, age/10-1] <-sum(untreated*sim_treated@coef[i,]) #multiplying each variable by its simulated coefficient, summing it and storing it in a matrix as the predicted csoa | |
} | |
} | |
#calculating the treatment effect for each entry by subtracting the treated from the controlled | |
storage.treat <- storage.matrix - storage.matrix.cont | |
#Creating a table of results | |
means.treat <-apply(storage.treat, 2, mean) #computing mean of each age | |
conf.intervals.treat <- apply(storage.treat, 2, quantile, probs = c(0.025, 0.975)) #computing 2.5 and 97.5 percentile of each age | |
table_ages.treat <- rbind(means.treat, conf.intervals.treat) #putting data together in one table | |
colnames(table_ages.treat) <- c(20,30,40,50,60,70,80) #change column names to appropriate age | |
table_ages.treat | |
#Plotting results onto a graph | |
plot(x = c(1:100), xlim= c(20,80) , ylim = c(-10,10), type = "n", xlab = "Age (Years)", | |
ylab = "CSOA", main= "Treatment Effect of Females") | |
for (age in agevector) { | |
segments( | |
x0 = age, | |
y0 = conf.intervals.treat[1, age/10-1], | |
x1 = age, | |
y1 = conf.intervals.treat[2, age/10-1], | |
lwd = 2) | |
} | |
lines(agevector, means.treat, pch=16, col='red', type='p') #plotting each of the means |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###QUESTION 3### | |
#Function for R squared that takes the two arguments: y and the predicted y | |
rsquare <- function(y, yhat) { | |
sumsquaredregression <- 0 | |
totalsumofsquares <- 0 | |
for (i in 1:length(y)) { | |
sumsquaredregression <- sumsquaredregression + (y[i]-yhat[i])^2 #sum of squared regression | |
totalsumofsquares <- totalsumofsquares + (y[i]-mean(y))^2 #total sum of squares | |
} | |
print (paste("The R-squared value is", | |
1-(sumsquaredregression/totalsumofsquares))) #equation for R squared | |
} | |
#loading data for lalonde | |
library(Matching) | |
data(lalonde) | |
#creating a linear model for lalonde to predict for re78 | |
lm.lalonde <- lm(re78 ~., data= lalonde) | |
yhat <- predict(lm.lalonde) | |
#Output r-squared values | |
rsquare(lalonde$re78, yhat) | |
summary(lm.lalonde)$r.squared |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##Question 4### | |
#loading data | |
library(foreign) | |
maze <- read.dta("mazedata1.dta") | |
#setting the treatment to be binary where Caste Revealed is 1 and 0 otherwise | |
treat <- ifelse(maze$treatment == "Caste Revealed", 1, 0) | |
maze <- data.frame(maze, treat) #appending binary version of treatment to dataframe | |
#holds all the round1 values for treatment group | |
treated <- maze$round1[which(maze$treat==1)] | |
#holds all the round1 values for control group | |
control <- maze$round1[which(maze$treat==0)] | |
#create storage list | |
store <- c() | |
#bootstrap the treatment effect for 10,000 samples | |
for (x in 1:10000) { | |
store[x] <- mean(sample(treated, length(treated), replace = TRUE)) - | |
mean(sample(control, length(control), replace = TRUE)) #subtract the control from the treatment | |
} | |
#store lower and upper boundary of confidence interval of the bootstrapped treatment effects | |
confidenceinterval <- quantile(store, c(0.025,0.975)) | |
#linear model with round1 as dependent variable and treatment as independent variable | |
lm.maze = lm(round1 ~ treat, data = maze) | |
#computing confidence interval using function based on linear model | |
confint <- rbind (confidenceinterval, confint(lm.maze)[2,]) #attach bootstrapped value in one table | |
confint | |
#create a probability density histogram of treatment effect | |
hist(store, xlab = "Treatment Effect", ylab= 'Probability Density', main= 'Probability Density of Treatment Effect', freq=FALSE) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###QUESTION 5### | |
#loading data | |
foo <- read.csv(url("https://tinyurl.com/yx8tqf3k")) | |
#setting randomness | |
set.seed(12345) | |
#removing 2000 rows randomly from the original data set | |
test_set_rows <- sample(1:length(foo$age), 2000, replace =FALSE) #sampling 2000 index numbers to removes | |
test_foo <- foo[-test_set_rows,] #removing the rows from the data set | |
#Building the model for surname A-F | |
glm.simple <- glm(treat~ age, data = test_foo) | |
glm.complex <- glm(treat~. -re78, data = test_foo) | |
library(boot) | |
##LOOCV | |
#using the cross validation function that by default sets the number of folds to 1 for LOOCV | |
LOOCV1 = cv.glm(test_foo, glm.simple)$delta[1] #simple model's MSE | |
LOOCV2 = cv.glm(test_foo, glm.complex)$delta[1] #complex model's MSE | |
##10-fold cross validation | |
#using the cross-validation function and setting the number of folds to 10 and averaging the 10 MSEs | |
cv.error.simple.10 = rep(0,10) #stores each CV result | |
for(i in 1:10) { #loops 10 times to attach each result | |
cv.error.simple.10[i]=cv.glm(test_foo, glm.simple, K=10)$delta[1] #performs 10-fold CV | |
} | |
mean(cv.error.simple.10) #computes mean for simple model | |
cv.error.complex.10 = rep(0,10) #stores each CV result | |
for(i in 1:10) { #loops 10 times to attach each result | |
cv.error.complex.10[i]=cv.glm(test_foo, glm.complex, K=10)$delta[1] #performs 10-fold CV | |
} | |
mean(cv.error.complex.10) #computes mean for complex model | |
##test set error | |
#subtracts the actual y value with the predicted y value from each model, squares each result and calculates the average | |
mean((foo[test_set_rows,]$treat -predict.glm(glm.simple, newdata=foo[test_set_rows,]))^2) #simple model | |
mean((foo[test_set_rows,]$treat -predict.glm(glm.complex, newdata=foo[test_set_rows,]))^2) #complex model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###QUESTION 6 | |
trt = matrix(NA,nrow=2,ncol=7) | |
ctrl = matrix(NA,nrow=2,ncol=7) | |
trt[,1]=c(0, 2) #18 | |
ctrl[,1]=c(3, 10) | |
trt[,2]=c(0, 3) #20 | |
ctrl[,2]=c(2, 8) | |
trt[,3]=c(0, 4) #22 | |
ctrl[,3]=c(2, 7) | |
trt[,4]=c(1, 3) #24 | |
ctrl[,4]=c(2, 6) | |
trt[,5]=c(1, 3) #26 | |
ctrl[,5]=c(2, 5) | |
trt[,6]=c(1, 3) #28 | |
ctrl[,6]=c(2, 4) | |
trt[,7]=c(1, 2) #30 | |
ctrl[,7]=c(1, 3) | |
c1 = rgb(red = 1, green = 0, blue = 0, alpha = 0.5) #trt | |
c2 = rgb(red = 0, green = 0, blue = 1, alpha = 0.5) #ctrl | |
plot(x = c(1:100), y = c(1:100), type = "n", xlim = c(17,31), ylim = c(0,11), cex.lab=1.2, | |
main = "Alcohol Consumption - 95% Prediction Intervals", xlab = "Age",ylab = "Drinks per Week") | |
for (age in seq(from=18,to=30,by=2)) { | |
segments(x0 = age-0.05, y0 = trt[1, (age-18)/2+1], | |
x1 = age-0.05, y1 = trt[2, (age-18)/2+1],lwd = 3,col=c1) | |
segments(x0 = age+0.05, y0 = ctrl[1, (age-18)/2+1], | |
x1 = age+0.05, y1 = ctrl[2, (age-18)/2+1],lwd = 3,col=c2) | |
} | |
legend('topright',legend=c('Treatment','Control'),fill=c(c1,c2)) | |
mtext("https://tinyurl.com/vwxuwop", side = 1, cex = 0.5, adj = 0, padj = 10) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment