cimentadaj · November 6, 2016 15:25
diff --git a/arm_ex_12.4.R b/arm_ex_12.4.R
 # Posterior predictive checking: continuing the previous exercise, use the fitted
 # model from Exercise 12.2(b) to simulate a new dataset of CD4 percentages
 # (with the same sample size and ages of the original dataset) for the final time
 # point of the study, and record the average CD4 percentage in this sample.
 # Repeat this process 1000 times and compare the simulated distribution to the
 # observed CD4 percentage at the final time point for the actual data.

 # Make the data similar to the model in mod2
 finaltime_data <- subset(cd4, !is.na(treatmnt) & !is.na(baseage))

 # Assign the final date to the date variable
 finaltime_data$VDATE <- as.Date(max(finaltime_data$VDATE, na.rm = T))

 # Only select the variables present in the model
 finaltime_data <- finaltime_data[, -c(1, 4, 5, 6, 8)]

 # Calculate the mean predicted CD4 percentage and it's standard deviation
 mean_cd4 <- mean(predict(mod2, newdata = finaltime_data), na.rm = T)
 sd_cd4 <- sd(predict(mod2, newdata = finaltime_data), na.rm = T)

 # Simulate 1000 observations considering the uncertainty
 # and look at the distribution.
 set.seed(421)
 hist(rnorm(1000, mean_cd4, sd = sd_cd4))

 # It's incredibly wide and it even includes negative numbers,
 # something impossible with percentages. But whats the actual
 # CD4 percentage of that date?
 cd4 <- subset(cd4, !is.na(VDATE))
 cd4[cd4$VDATE == max(cd4$VDATE, na.rm = T), ]

 # And the mean is:
 mean(c(39.0, 21.2))
 # 30.1

 # So our predictions are extremely uncertain. Over half our simulations
 # are underestimating the date and a handful are overestimating
 # the results. It looks like our model is terrible at predicting that final
 # time point.
	# Posterior predictive checking: continuing the previous exercise, use the fitted
	# model from Exercise 12.2(b) to simulate a new dataset of CD4 percentages
	# (with the same sample size and ages of the original dataset) for the final time
	# point of the study, and record the average CD4 percentage in this sample.
	# Repeat this process 1000 times and compare the simulated distribution to the
	# observed CD4 percentage at the final time point for the actual data.

	# Make the data similar to the model in mod2
	finaltime_data <- subset(cd4, !is.na(treatmnt) & !is.na(baseage))

	# Assign the final date to the date variable
	finaltime_data$VDATE <- as.Date(max(finaltime_data$VDATE, na.rm = T))

	# Only select the variables present in the model
	finaltime_data <- finaltime_data[, -c(1, 4, 5, 6, 8)]

	# Calculate the mean predicted CD4 percentage and it's standard deviation
	mean_cd4 <- mean(predict(mod2, newdata = finaltime_data), na.rm = T)
	sd_cd4 <- sd(predict(mod2, newdata = finaltime_data), na.rm = T)

	# Simulate 1000 observations considering the uncertainty
	# and look at the distribution.
	set.seed(421)
	hist(rnorm(1000, mean_cd4, sd = sd_cd4))

	# It's incredibly wide and it even includes negative numbers,
	# something impossible with percentages. But whats the actual
	# CD4 percentage of that date?
	cd4 <- subset(cd4, !is.na(VDATE))
	cd4[cd4$VDATE == max(cd4$VDATE, na.rm = T), ]

	# And the mean is:
	mean(c(39.0, 21.2))
	# 30.1

	# So our predictions are extremely uncertain. Over half our simulations
	# are underestimating the date and a handful are overestimating
	# the results. It looks like our model is terrible at predicting that final
	# time point.