diamonaj · November 15, 2016 15:42
diff --git a/Advanced FA Regression to the Mean b/Advanced FA Regression to the Mean
 ### REGRESSION TO THE MEAN

 set.seed(12345)

 dev.off()

 ### Generate "TRUE ABILITY"

 # N people, all with "zero" ability
 N <- 20
 true_ability <- rep(0, N)

 # their luck on two tests varies within set bounds
 upper_bound_luck <- 25
 lower_bound_luck <- -upper_bound_luck # in this default case, minus 25

 # luck on the first test (average luck is 0)
 luck1 <- round(runif(N, lower_bound_luck, upper_bound_luck))

 # luck on the second test (average luck is 0)
 luck2 <- round(runif(N, lower_bound_luck, upper_bound_luck))

 # TEST RESULTS
 observed_result1 <- true_ability + luck1
 mean_result1 <- mean(observed_result1)

 observed_result2 <- true_ability + luck2
 mean_result2 <- mean(observed_result2)

 # TO PLOT THE DATA
 plot(x = (1:N), y = observed_result1, col = "blue", pch = 19, cex = 0.75, 
     xlab = "Person #", ylab = "Test 1 Result")

 points(x=1:N, y = observed_result2, col = "red", pch = 23, cex = 0.75)

 # horizontal dotted line at true average
 abline(h = 0, lty = 2)

 # HOW FAR FROM TRUE MEAN RESULT (zero--zero true ability & zero luck)
 distance1 <- abs(observed_result1)
 distance2 <- abs(observed_result2)

 # Did units with extreme values of distance 1 have less extreme values
 # for distance 2?
 what_is_extreme <- 18
 extreme1 <- which(distance1 >= what_is_extreme)
 length(extreme1)

 points(x=c(1:100)[extreme1], y = observed_result1[extreme1], col = "black", cex = 1.2)

 less_extreme_2nd_time <- which(distance1[extreme1] > distance2[extreme1])
 length(less_extreme_2nd_time)

 # WHAT IS THE DISTRIBUTION OF OBSERVED RESULT THE FIRST TIME?
 quantile(distance1)

 # WHICH UNIT(s) SCORED HIGHEST THE **FIRST** TIME?
 which(distance1 == max(distance1))

 # WHAT WERE THE SCORES OF THESE BEST PERFORMERS?
 distance1[which(distance1 == max(distance1))]

 # HOW FAR FROM THE TRUE MEAN RESULT WERE THESE UNITS THE **SECOND** TIME?
 distance2[which(distance1 == max(distance1))]


 ##########
 QUESTIONS:
  
  (1) Would you expect the histograms of "distance1" and "distance2" 
 to be similar or the same?  Plot them and seek.connection(
  
  (2) The code above shows that regression to the mean occurs 
  for the best performers on test 1.  Does it also occur for the worst performers?
  
  (3) Try it for the top 25% and bottom 25% of performers.  
  To do so, modify the code--e.g., "which(distance1 > XXX)"--get XXX from quantile()
  
  (4) IF YOU HAVE TIME: Randomly distribute non-zero "true abilities" throughout
  the sample--e.g., round(runif(100, -10, 10)) --- see if this makes a difference
	### REGRESSION TO THE MEAN

	set.seed(12345)

	dev.off()

	### Generate "TRUE ABILITY"

	# N people, all with "zero" ability
	N <- 20
	true_ability <- rep(0, N)

	# their luck on two tests varies within set bounds
	upper_bound_luck <- 25
	lower_bound_luck <- -upper_bound_luck # in this default case, minus 25

	# luck on the first test (average luck is 0)
	luck1 <- round(runif(N, lower_bound_luck, upper_bound_luck))

	# luck on the second test (average luck is 0)
	luck2 <- round(runif(N, lower_bound_luck, upper_bound_luck))

	# TEST RESULTS
	observed_result1 <- true_ability + luck1
	mean_result1 <- mean(observed_result1)

	observed_result2 <- true_ability + luck2
	mean_result2 <- mean(observed_result2)

	# TO PLOT THE DATA
	plot(x = (1:N), y = observed_result1, col = "blue", pch = 19, cex = 0.75,
	xlab = "Person #", ylab = "Test 1 Result")

	points(x=1:N, y = observed_result2, col = "red", pch = 23, cex = 0.75)

	# horizontal dotted line at true average
	abline(h = 0, lty = 2)

	# HOW FAR FROM TRUE MEAN RESULT (zero--zero true ability & zero luck)
	distance1 <- abs(observed_result1)
	distance2 <- abs(observed_result2)

	# Did units with extreme values of distance 1 have less extreme values
	# for distance 2?
	what_is_extreme <- 18
	extreme1 <- which(distance1 >= what_is_extreme)
	length(extreme1)

	points(x=c(1:100)[extreme1], y = observed_result1[extreme1], col = "black", cex = 1.2)

	less_extreme_2nd_time <- which(distance1[extreme1] > distance2[extreme1])
	length(less_extreme_2nd_time)

	# WHAT IS THE DISTRIBUTION OF OBSERVED RESULT THE FIRST TIME?
	quantile(distance1)

	# WHICH UNIT(s) SCORED HIGHEST THE FIRST TIME?
	which(distance1 == max(distance1))

	# WHAT WERE THE SCORES OF THESE BEST PERFORMERS?
	distance1[which(distance1 == max(distance1))]

	# HOW FAR FROM THE TRUE MEAN RESULT WERE THESE UNITS THE SECOND TIME?
	distance2[which(distance1 == max(distance1))]


	##########
	QUESTIONS:

	(1) Would you expect the histograms of "distance1" and "distance2"
	to be similar or the same? Plot them and seek.connection(

	(2) The code above shows that regression to the mean occurs
	for the best performers on test 1. Does it also occur for the worst performers?

	(3) Try it for the top 25% and bottom 25% of performers.
	To do so, modify the code--e.g., "which(distance1 > XXX)"--get XXX from quantile()

	(4) IF YOU HAVE TIME: Randomly distribute non-zero "true abilities" throughout
	the sample--e.g., round(runif(100, -10, 10)) --- see if this makes a difference
No results found