Created
November 15, 2016 15:42
-
-
Save diamonaj/2bb1b2eba762c2e89bb4d36906faaa75 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ### REGRESSION TO THE MEAN | |
| set.seed(12345) | |
| dev.off() | |
| ### Generate "TRUE ABILITY" | |
| # N people, all with "zero" ability | |
| N <- 20 | |
| true_ability <- rep(0, N) | |
| # their luck on two tests varies within set bounds | |
| upper_bound_luck <- 25 | |
| lower_bound_luck <- -upper_bound_luck # in this default case, minus 25 | |
| # luck on the first test (average luck is 0) | |
| luck1 <- round(runif(N, lower_bound_luck, upper_bound_luck)) | |
| # luck on the second test (average luck is 0) | |
| luck2 <- round(runif(N, lower_bound_luck, upper_bound_luck)) | |
| # TEST RESULTS | |
| observed_result1 <- true_ability + luck1 | |
| mean_result1 <- mean(observed_result1) | |
| observed_result2 <- true_ability + luck2 | |
| mean_result2 <- mean(observed_result2) | |
| # TO PLOT THE DATA | |
| plot(x = (1:N), y = observed_result1, col = "blue", pch = 19, cex = 0.75, | |
| xlab = "Person #", ylab = "Test 1 Result") | |
| points(x=1:N, y = observed_result2, col = "red", pch = 23, cex = 0.75) | |
| # horizontal dotted line at true average | |
| abline(h = 0, lty = 2) | |
| # HOW FAR FROM TRUE MEAN RESULT (zero--zero true ability & zero luck) | |
| distance1 <- abs(observed_result1) | |
| distance2 <- abs(observed_result2) | |
| # Did units with extreme values of distance 1 have less extreme values | |
| # for distance 2? | |
| what_is_extreme <- 18 | |
| extreme1 <- which(distance1 >= what_is_extreme) | |
| length(extreme1) | |
| points(x=c(1:100)[extreme1], y = observed_result1[extreme1], col = "black", cex = 1.2) | |
| less_extreme_2nd_time <- which(distance1[extreme1] > distance2[extreme1]) | |
| length(less_extreme_2nd_time) | |
| # WHAT IS THE DISTRIBUTION OF OBSERVED RESULT THE FIRST TIME? | |
| quantile(distance1) | |
| # WHICH UNIT(s) SCORED HIGHEST THE **FIRST** TIME? | |
| which(distance1 == max(distance1)) | |
| # WHAT WERE THE SCORES OF THESE BEST PERFORMERS? | |
| distance1[which(distance1 == max(distance1))] | |
| # HOW FAR FROM THE TRUE MEAN RESULT WERE THESE UNITS THE **SECOND** TIME? | |
| distance2[which(distance1 == max(distance1))] | |
| ########## | |
| QUESTIONS: | |
| (1) Would you expect the histograms of "distance1" and "distance2" | |
| to be similar or the same? Plot them and seek.connection( | |
| (2) The code above shows that regression to the mean occurs | |
| for the best performers on test 1. Does it also occur for the worst performers? | |
| (3) Try it for the top 25% and bottom 25% of performers. | |
| To do so, modify the code--e.g., "which(distance1 > XXX)"--get XXX from quantile() | |
| (4) IF YOU HAVE TIME: Randomly distribute non-zero "true abilities" throughout | |
| the sample--e.g., round(runif(100, -10, 10)) --- see if this makes a difference |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment