diamonaj · September 19, 2018 10:58
diff --git a/2.2.R b/2.2.R
 # Regression 101

 # CLAIM:
 # For regression with a single binary predictor, 
 # the regression coefficient on the predictor is the difference 
 # between the averages of the two groups.

 # TASK ONE:
 # Decide if this claim is true/false.
 # Work together as a team to perform analysis and/or creates a 
 # data visualization on "haha.RData" to show it as true/false.
 # The binary predictor you should use: "treatment" variable.
 # The outcome variable (dependent variable) is "post.test".

 lm1 <- lm(post.test ~ treatment)
 lm1$coef

 mean(post.test[treatment == 1]) - mean(post.test[treatment==0])

 plot(treatment, post.test, pch = 20, col = "purple4", cex = 1.2)
 points(0, mean(post.test[treatment==0]), col = "green", pch = 20, cex = 3)
 points(1, mean(post.test[treatment==1]), col = "green", pch = 20, cex = 3)
 abline(lm1, col = "red", lwd = 1.2)


 # TASK TWO:
 # In this simple case, what does the intercept represent?

 # TASK THREE: Calculate the following--
 # Average error (discrepancy) using absolute values
 # RSS
 # Residual error (check your work using "summary()" function)
 # R^2 (check your work using "summary()" on your model)

 mean(abs(predict(lm1) - post.test)) # 4.26
 sqrt(mean((predict(lm1) - post.test)^2)) # 5.807
 sqrt(sum((predict(lm1) - post.test)^2)/(length(post.test)-2)) # 5.951

 numerator_Rsquared <- sum((predict(lm1) - post.test)^2)
 denominator_Rsquared <- sum((mean(post.test) - post.test)^2)
 1 - numerator_Rsquared/denominator_Rsquared # 0.09255

 # TASK FOUR: Now run a multivariate regression, using the 
 # other continuous predictor. Use "summary()" and compare
 # RMSE, R^2, & Residual Error here vs. the univariate case above.

 lm2 <- lm(post.test ~ pre.test + treatment)
 summary(lm2)

 # TASK FIVE
 # RUN TWO SEPARATE REGRESSIONS, FOR TWO GROUPS:
 # THE TREATMENT GROUP, and THE CONTROL GROUP.
 # PLOT BOTH LINES ON THE SAME AXIS.
 # YOU MAY FIND IT HELPFUL TO INCLUDE: xlim = c(50, 150), ylim = c(50, 150)
 # SEE THE PROFESSOR'S PIAZZA POST ON THIS TOPIC FOR HELP...
 # WHAT DOES THIS FINAL ANALYSIS SUGGEST, ESP IF 
 # TREATMENT WAS RANDOMLY ASSIGNED?

 pre.test <- pre.test[order(pre.test)]
 post.test <- post.test[order(pre.test)]


 lm.t <- lm(post.test[treatment == 1] ~ pre.test[treatment == 1])
 lm.c <- lm(post.test[treatment == 0] ~ pre.test[treatment == 0])

 plot(x = pre.test[treatment==1], y = post.test[treatment == 1], col = "red", 
     xlab = "pre.test", ylab = "post.test", xlim = c(50, 150), ylim = c(50, 150))
 points(x = pre.test[treatment==0], y = post.test[treatment ==0], col = "blue")
 abline(lm.t, col = "red", lwd = 1.3)
 abline(lm.c, col = "blue", lwd = 1.3)

 loess.t <- loess(post.test[treatment == 1] ~ pre.test[treatment == 1], span = 0.7)
 lines(pre.test[treatment==1], predict(loess.t, pre.test[treatment==1]), lty = 3, lwd = 3, col = "red")

 loess.t <- loess(post.test[treatment == 0] ~ pre.test[treatment == 0], span = 0.7)
 lines(pre.test[treatment==0], predict(loess.t, pre.test[treatment==0]), lty = 3, lwd = 3, col = "blue")

 library(Matching)
 data(lalonde)
 attach(lalonde) # by using 'attach()', we don't have to use the $ notation
                # R now knows where to find the variables--they are in working memory
                # I tend NOT to do this, b/c there you can get yourself in trouble...
                # but here it makes sense.

 re74 <- re74[order(re74)] # just to put the indep var in order for when we draw lines later
 re78 <- re78[order(re74)] # if we order x, then we also should order y the same way

 # consider 'married' to be the treatment
 lm.t <- lm(re78[married == 1] ~ re74[married == 1])
 lm.c <- lm(re78[married == 0] ~ re74[married == 0])

 # create the plot and add points in red to show the married people
 plot(x = re74[married==1], y = re78[married == 1], col = "red", 
     xlab = "earnings in 74", ylab = "earnings in 78", 
     xlim = c(0, 1.1*max(re74)), ylim = c(0, 1.1*max(re78)))

 # add points in blue to show the unmarried people
 points(x = re74[married==0], y = re78[married ==0], col = "blue")

 # plot the regression lines for married and unmarried people...
 abline(lm.t, col = "red", lwd = 1.3)
 abline(lm.c, col = "blue", lwd = 1.3)
	# Regression 101

	# CLAIM:
	# For regression with a single binary predictor,
	# the regression coefficient on the predictor is the difference
	# between the averages of the two groups.

	# TASK ONE:
	# Decide if this claim is true/false.
	# Work together as a team to perform analysis and/or creates a
	# data visualization on "haha.RData" to show it as true/false.
	# The binary predictor you should use: "treatment" variable.
	# The outcome variable (dependent variable) is "post.test".

	lm1 <- lm(post.test ~ treatment)
	lm1$coef

	mean(post.test[treatment == 1]) - mean(post.test[treatment==0])

	plot(treatment, post.test, pch = 20, col = "purple4", cex = 1.2)
	points(0, mean(post.test[treatment==0]), col = "green", pch = 20, cex = 3)
	points(1, mean(post.test[treatment==1]), col = "green", pch = 20, cex = 3)
	abline(lm1, col = "red", lwd = 1.2)


	# TASK TWO:
	# In this simple case, what does the intercept represent?

	# TASK THREE: Calculate the following--
	# Average error (discrepancy) using absolute values
	# RSS
	# Residual error (check your work using "summary()" function)
	# R^2 (check your work using "summary()" on your model)

	mean(abs(predict(lm1) - post.test)) # 4.26
	sqrt(mean((predict(lm1) - post.test)^2)) # 5.807
	sqrt(sum((predict(lm1) - post.test)^2)/(length(post.test)-2)) # 5.951

	numerator_Rsquared <- sum((predict(lm1) - post.test)^2)
	denominator_Rsquared <- sum((mean(post.test) - post.test)^2)
	1 - numerator_Rsquared/denominator_Rsquared # 0.09255

	# TASK FOUR: Now run a multivariate regression, using the
	# other continuous predictor. Use "summary()" and compare
	# RMSE, R^2, & Residual Error here vs. the univariate case above.

	lm2 <- lm(post.test ~ pre.test + treatment)
	summary(lm2)

	# TASK FIVE
	# RUN TWO SEPARATE REGRESSIONS, FOR TWO GROUPS:
	# THE TREATMENT GROUP, and THE CONTROL GROUP.
	# PLOT BOTH LINES ON THE SAME AXIS.
	# YOU MAY FIND IT HELPFUL TO INCLUDE: xlim = c(50, 150), ylim = c(50, 150)
	# SEE THE PROFESSOR'S PIAZZA POST ON THIS TOPIC FOR HELP...
	# WHAT DOES THIS FINAL ANALYSIS SUGGEST, ESP IF
	# TREATMENT WAS RANDOMLY ASSIGNED?

	pre.test <- pre.test[order(pre.test)]
	post.test <- post.test[order(pre.test)]


	lm.t <- lm(post.test[treatment == 1] ~ pre.test[treatment == 1])
	lm.c <- lm(post.test[treatment == 0] ~ pre.test[treatment == 0])

	plot(x = pre.test[treatment==1], y = post.test[treatment == 1], col = "red",
	xlab = "pre.test", ylab = "post.test", xlim = c(50, 150), ylim = c(50, 150))
	points(x = pre.test[treatment==0], y = post.test[treatment ==0], col = "blue")
	abline(lm.t, col = "red", lwd = 1.3)
	abline(lm.c, col = "blue", lwd = 1.3)

	loess.t <- loess(post.test[treatment == 1] ~ pre.test[treatment == 1], span = 0.7)
	lines(pre.test[treatment==1], predict(loess.t, pre.test[treatment==1]), lty = 3, lwd = 3, col = "red")

	loess.t <- loess(post.test[treatment == 0] ~ pre.test[treatment == 0], span = 0.7)
	lines(pre.test[treatment==0], predict(loess.t, pre.test[treatment==0]), lty = 3, lwd = 3, col = "blue")

	library(Matching)
	data(lalonde)
	attach(lalonde) # by using 'attach()', we don't have to use the $ notation
	# R now knows where to find the variables--they are in working memory
	# I tend NOT to do this, b/c there you can get yourself in trouble...
	# but here it makes sense.

	re74 <- re74[order(re74)] # just to put the indep var in order for when we draw lines later
	re78 <- re78[order(re74)] # if we order x, then we also should order y the same way

	# consider 'married' to be the treatment
	lm.t <- lm(re78[married == 1] ~ re74[married == 1])
	lm.c <- lm(re78[married == 0] ~ re74[married == 0])

	# create the plot and add points in red to show the married people
	plot(x = re74[married==1], y = re78[married == 1], col = "red",
	xlab = "earnings in 74", ylab = "earnings in 78",
	xlim = c(0, 1.1max(re74)), ylim = c(0, 1.1max(re78)))

	# add points in blue to show the unmarried people
	points(x = re74[married==0], y = re78[married ==0], col = "blue")

	# plot the regression lines for married and unmarried people...
	abline(lm.t, col = "red", lwd = 1.3)
	abline(lm.c, col = "blue", lwd = 1.3)
No results found