Skip to content

Instantly share code, notes, and snippets.

@diamonaj
Created September 19, 2018 10:58
Show Gist options
  • Select an option

  • Save diamonaj/3b13b91aeb7f8584c3de259b980eeafb to your computer and use it in GitHub Desktop.

Select an option

Save diamonaj/3b13b91aeb7f8584c3de259b980eeafb to your computer and use it in GitHub Desktop.
# Regression 101
# CLAIM:
# For regression with a single binary predictor,
# the regression coefficient on the predictor is the difference
# between the averages of the two groups.
# TASK ONE:
# Decide if this claim is true/false.
# Work together as a team to perform analysis and/or creates a
# data visualization on "haha.RData" to show it as true/false.
# The binary predictor you should use: "treatment" variable.
# The outcome variable (dependent variable) is "post.test".
lm1 <- lm(post.test ~ treatment)
lm1$coef
mean(post.test[treatment == 1]) - mean(post.test[treatment==0])
plot(treatment, post.test, pch = 20, col = "purple4", cex = 1.2)
points(0, mean(post.test[treatment==0]), col = "green", pch = 20, cex = 3)
points(1, mean(post.test[treatment==1]), col = "green", pch = 20, cex = 3)
abline(lm1, col = "red", lwd = 1.2)
# TASK TWO:
# In this simple case, what does the intercept represent?
# TASK THREE: Calculate the following--
# Average error (discrepancy) using absolute values
# RSS
# Residual error (check your work using "summary()" function)
# R^2 (check your work using "summary()" on your model)
mean(abs(predict(lm1) - post.test)) # 4.26
sqrt(mean((predict(lm1) - post.test)^2)) # 5.807
sqrt(sum((predict(lm1) - post.test)^2)/(length(post.test)-2)) # 5.951
numerator_Rsquared <- sum((predict(lm1) - post.test)^2)
denominator_Rsquared <- sum((mean(post.test) - post.test)^2)
1 - numerator_Rsquared/denominator_Rsquared # 0.09255
# TASK FOUR: Now run a multivariate regression, using the
# other continuous predictor. Use "summary()" and compare
# RMSE, R^2, & Residual Error here vs. the univariate case above.
lm2 <- lm(post.test ~ pre.test + treatment)
summary(lm2)
# TASK FIVE
# RUN TWO SEPARATE REGRESSIONS, FOR TWO GROUPS:
# THE TREATMENT GROUP, and THE CONTROL GROUP.
# PLOT BOTH LINES ON THE SAME AXIS.
# YOU MAY FIND IT HELPFUL TO INCLUDE: xlim = c(50, 150), ylim = c(50, 150)
# SEE THE PROFESSOR'S PIAZZA POST ON THIS TOPIC FOR HELP...
# WHAT DOES THIS FINAL ANALYSIS SUGGEST, ESP IF
# TREATMENT WAS RANDOMLY ASSIGNED?
pre.test <- pre.test[order(pre.test)]
post.test <- post.test[order(pre.test)]
lm.t <- lm(post.test[treatment == 1] ~ pre.test[treatment == 1])
lm.c <- lm(post.test[treatment == 0] ~ pre.test[treatment == 0])
plot(x = pre.test[treatment==1], y = post.test[treatment == 1], col = "red",
xlab = "pre.test", ylab = "post.test", xlim = c(50, 150), ylim = c(50, 150))
points(x = pre.test[treatment==0], y = post.test[treatment ==0], col = "blue")
abline(lm.t, col = "red", lwd = 1.3)
abline(lm.c, col = "blue", lwd = 1.3)
loess.t <- loess(post.test[treatment == 1] ~ pre.test[treatment == 1], span = 0.7)
lines(pre.test[treatment==1], predict(loess.t, pre.test[treatment==1]), lty = 3, lwd = 3, col = "red")
loess.t <- loess(post.test[treatment == 0] ~ pre.test[treatment == 0], span = 0.7)
lines(pre.test[treatment==0], predict(loess.t, pre.test[treatment==0]), lty = 3, lwd = 3, col = "blue")
library(Matching)
data(lalonde)
attach(lalonde) # by using 'attach()', we don't have to use the $ notation
# R now knows where to find the variables--they are in working memory
# I tend NOT to do this, b/c there you can get yourself in trouble...
# but here it makes sense.
re74 <- re74[order(re74)] # just to put the indep var in order for when we draw lines later
re78 <- re78[order(re74)] # if we order x, then we also should order y the same way
# consider 'married' to be the treatment
lm.t <- lm(re78[married == 1] ~ re74[married == 1])
lm.c <- lm(re78[married == 0] ~ re74[married == 0])
# create the plot and add points in red to show the married people
plot(x = re74[married==1], y = re78[married == 1], col = "red",
xlab = "earnings in 74", ylab = "earnings in 78",
xlim = c(0, 1.1*max(re74)), ylim = c(0, 1.1*max(re78)))
# add points in blue to show the unmarried people
points(x = re74[married==0], y = re78[married ==0], col = "blue")
# plot the regression lines for married and unmarried people...
abline(lm.t, col = "red", lwd = 1.3)
abline(lm.c, col = "blue", lwd = 1.3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment