Created
September 19, 2018 10:58
-
-
Save diamonaj/3b13b91aeb7f8584c3de259b980eeafb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regression 101
#
# CLAIM:
# For regression with a single binary predictor, the coefficient on
# that predictor equals the difference between the two group means.
#
# TASK ONE:
# Decide if this claim is true/false.  Work together as a team to
# perform an analysis and/or create a data visualization on
# "haha.RData" to show it as true/false.
#   Binary predictor:   "treatment"
#   Outcome (DV):       "post.test"
# NOTE(review): assumes post.test and treatment are already in the
# workspace (loaded from haha.RData) -- confirm before sourcing.

# Fit the simple regression of outcome on the binary predictor.
lm1 <- lm(post.test ~ treatment)
coef(lm1)

# Raw difference in group means -- should match the slope above.
group_mean_diff <- mean(post.test[treatment == 1]) - mean(post.test[treatment == 0])
group_mean_diff

# Visual check: scatter the data, mark each group mean in green, and
# overlay the fitted line in red (the line passes through both means).
plot(treatment, post.test, pch = 20, col = "purple4", cex = 1.2)
points(0, mean(post.test[treatment == 0]), col = "green", pch = 20, cex = 3)
points(1, mean(post.test[treatment == 1]), col = "green", pch = 20, cex = 3)
abline(lm1, col = "red", lwd = 1.2)
# TASK TWO:
# In this simple case, what does the intercept represent?

# TASK THREE: Calculate the following --
#   * average error (discrepancy) using absolute values
#   * RSS
#   * residual standard error (check your work with summary())
#   * R^2 (check your work with summary() on your model)

res <- residuals(lm1)        # observed minus fitted; same magnitudes
n   <- length(post.test)     # as predict(lm1) - post.test

mean(abs(res))               # average absolute discrepancy: ~4.26
sqrt(mean(res^2))            # RMSE: ~5.807
sqrt(sum(res^2) / (n - 2))   # residual standard error: ~5.951

# R^2 = 1 - RSS/TSS
rss <- sum(res^2)
tss <- sum((post.test - mean(post.test))^2)
1 - rss / tss                # ~0.09255
# TASK FOUR: Run a multiple regression that adds the other continuous
# predictor (pre.test).  Use summary() to compare RMSE, R^2, and the
# residual standard error here vs. the univariate model above.
lm2 <- lm(post.test ~ pre.test + treatment)
summary(lm2)
# TASK FIVE
# Run TWO SEPARATE REGRESSIONS, for the TREATMENT group and the
# CONTROL group, and plot both lines on the same axes.
# (xlim = c(50, 150), ylim = c(50, 150) keeps the plots comparable --
# see the professor's Piazza post on this topic for help.)
# What does this final analysis suggest, especially if treatment was
# randomly assigned?

# BUG FIX: sort all variables with ONE permutation so rows stay
# aligned.  The original sorted pre.test in place and then called
# order(pre.test) again -- by then pre.test was already sorted, so
# post.test kept its old order and no longer matched pre.test.
# treatment must be permuted too, or the group subsets below mix rows.
ord       <- order(pre.test)
pre.test  <- pre.test[ord]
post.test <- post.test[ord]
treatment <- treatment[ord]

# Separate fits for the two groups.
lm.t <- lm(post.test[treatment == 1] ~ pre.test[treatment == 1])
lm.c <- lm(post.test[treatment == 0] ~ pre.test[treatment == 0])

# Treated in red, controls in blue, each with its fitted line.
plot(x = pre.test[treatment == 1], y = post.test[treatment == 1], col = "red",
     xlab = "pre.test", ylab = "post.test", xlim = c(50, 150), ylim = c(50, 150))
points(x = pre.test[treatment == 0], y = post.test[treatment == 0], col = "blue")
abline(lm.t, col = "red", lwd = 1.3)
abline(lm.c, col = "blue", lwd = 1.3)

# Loess smoothers (dotted) as a check for nonlinearity.  The control
# fit gets its own name (loess.c) instead of overwriting loess.t.
loess.t <- loess(post.test[treatment == 1] ~ pre.test[treatment == 1], span = 0.7)
lines(pre.test[treatment == 1], predict(loess.t, pre.test[treatment == 1]),
      lty = 3, lwd = 3, col = "red")
loess.c <- loess(post.test[treatment == 0] ~ pre.test[treatment == 0], span = 0.7)
lines(pre.test[treatment == 0], predict(loess.c, pre.test[treatment == 0]),
      lty = 3, lwd = 3, col = "blue")
# Replicate the two-group analysis on the Lalonde data.
library(Matching)
data(lalonde)

# attach() puts the lalonde columns on the search path so we don't
# need the $ notation.  Generally risky (masking, stale copies), but
# acceptable for a short classroom demo.
attach(lalonde)

# BUG FIX: sort every variable we use by re74 with a SINGLE
# permutation so rows stay matched.  The original sorted re74 in
# place and then called order(re74) again -- by then re74 was already
# sorted, so re78 kept its old order and was misaligned with re74.
# married must be permuted too, or the group subsets below mix rows.
ord     <- order(re74)
re74    <- re74[ord]
re78    <- re78[ord]
married <- married[ord]

# Consider 'married' to be the treatment indicator.
lm.t <- lm(re78[married == 1] ~ re74[married == 1])
lm.c <- lm(re78[married == 0] ~ re74[married == 0])

# Married people in red, unmarried in blue.
plot(x = re74[married == 1], y = re78[married == 1], col = "red",
     xlab = "earnings in 74", ylab = "earnings in 78",
     xlim = c(0, 1.1 * max(re74)), ylim = c(0, 1.1 * max(re78)))
points(x = re74[married == 0], y = re78[married == 0], col = "blue")

# The regression lines for the two groups.
abline(lm.t, col = "red", lwd = 1.3)
abline(lm.c, col = "blue", lwd = 1.3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment