diamonaj · November 9, 2023 16:12
diff --git a/reg_to_mean.R b/reg_to_mean.R
 # Regression-to-the-mean demo: simulate before/after scores, z-score them,
 # and show that the fitted slope equals the correlation (so |slope| <= 1).
 set.seed(432)  # fixed seed so the simulated numbers are reproducible

 # imagine these are scores before a program:
 before <- rnorm(50, mean = 0, sd = 10)

 # imagine these are scores after a program: the "before" score plus a noisy
 # gain that averages +5 points (sd = 20 is the luck in the "after" assessment)
 after <- before + rnorm(50, mean = 5, sd = 20)

 # the scores are correlated, but not perfectly correlated
 # (the original notes report a correlation of 0.32 for this seed).
 # print() is explicit so the value is also shown when the file is run with
 # source(), which does not auto-print top-level expressions by default.
 print(cor(before, after))

 # calculate z-scores for the before scores, to see how far from the mean
 # calculate z-scores for the after scores, to see how extreme
 before_zscore <- (before - mean(before)) / sd(before)
 after_zscore <- (after - mean(after)) / sd(after)

 # plot the z-scored observations (before vs. after)
 plot(x = before_zscore, y = after_zscore,
      pch = 16, col = "blue",
      xlim = c(-3, 3), ylim = c(-3, 3))

 # run a linear regression (fit a line) to the points in the plot.
 # This line can NEVER have a slope greater than 1 (or less than -1):
 # the slope of a regression line drawn through z-scored (scaled)
 # observations is always the simple correlation, and a correlation is
 # never less than -1 or greater than +1.
 reg1 <- lm(after_zscore ~ before_zscore)
 abline(reg1, col = "red", lwd = 4)

 # the slope of the line equals the correlation printed above; it is the
 # "Estimate" entry in the before_zscore row of the coefficient table
 # (first column, second row)
 print(summary(reg1))

 # just to show that z-scoring leaves the correlation unchanged
 print(cor(before_zscore, after_zscore))

 # and this is why whenever you have correlated before/after data you have
 # regression to the mean: for any given 'before' z-score (call that value
 # "b", e.g., b = 1), the expected 'after' z-score (i.e., the value on the
 # regression line) is slope * b = correlation * b, so its absolute value
 # is less than or equal to |b| (the absolute value of b).
 # The two are equal in absolute value only when the correlation is perfect
 # (slope exactly 1, or exactly -1). Otherwise the line is shallower than
 # that, and the expected 'after' z-score is closer to the mean (0) than b.
	# Regression-to-the-mean demo: simulate before/after scores, z-score them,
	# and show that the fitted slope equals the correlation (so |slope| <= 1).
	set.seed(432)  # fixed seed so the simulated numbers are reproducible

	# imagine these are scores before a program:
	before <- rnorm(50, mean = 0, sd = 10)

	# imagine these are scores after a program: the "before" score plus a noisy
	# gain that averages +5 points (sd = 20 is the luck in the "after" assessment)
	after <- before + rnorm(50, mean = 5, sd = 20)

	# the scores are correlated, but not perfectly correlated
	# (the original notes report a correlation of 0.32 for this seed).
	# print() is explicit so the value is also shown when the file is run with
	# source(), which does not auto-print top-level expressions by default.
	print(cor(before, after))

	# calculate z-scores for the before scores, to see how far from the mean
	# calculate z-scores for the after scores, to see how extreme
	before_zscore <- (before - mean(before)) / sd(before)
	after_zscore <- (after - mean(after)) / sd(after)

	# plot the z-scored observations (before vs. after)
	plot(x = before_zscore, y = after_zscore,
	     pch = 16, col = "blue",
	     xlim = c(-3, 3), ylim = c(-3, 3))

	# run a linear regression (fit a line) to the points in the plot.
	# This line can NEVER have a slope greater than 1 (or less than -1):
	# the slope of a regression line drawn through z-scored (scaled)
	# observations is always the simple correlation, and a correlation is
	# never less than -1 or greater than +1.
	reg1 <- lm(after_zscore ~ before_zscore)
	abline(reg1, col = "red", lwd = 4)

	# the slope of the line equals the correlation printed above; it is the
	# "Estimate" entry in the before_zscore row of the coefficient table
	# (first column, second row)
	print(summary(reg1))

	# just to show that z-scoring leaves the correlation unchanged
	print(cor(before_zscore, after_zscore))

	# and this is why whenever you have correlated before/after data you have
	# regression to the mean: for any given 'before' z-score (call that value
	# "b", e.g., b = 1), the expected 'after' z-score (i.e., the value on the
	# regression line) is slope * b = correlation * b, so its absolute value
	# is less than or equal to |b| (the absolute value of b).
	# The two are equal in absolute value only when the correlation is perfect
	# (slope exactly 1, or exactly -1). Otherwise the line is shallower than
	# that, and the expected 'after' z-score is closer to the mean (0) than b.