diamonaj · November 14, 2016 20:44
diff --git a/CS112 11.1 b/CS112 11.1
 ### CS112 LESSON PLAN 11.1 (VERSION 2): RESULTS SCRIPT

 # Empty the workspace and fix the seed so every run gives the same numbers.
 rm(list = ls())
 set.seed(13345)

 ### SIMULATE THE FULL DATA SET (SPLIT INTO TRAINING AND TEST SETS LATER)
 # 100 predictor values drawn uniformly from [-4, 3], rounded to two decimals.
 x <- round(runif(n = 100, min = -4, max = 3), digits = 2)
 # The response is a cubic in x; jitter() adds the random noise
 # (i.e., the epsilon in the regression equation).
 y <- jitter(x^3 + 2 * x^2 - 5 * x - 3, factor = 3000)

 ### SCATTERPLOT OF THE SIMULATED DATA
 plot(x, y)

 ### PICK 50 OF THE 100 ROW INDICES AT RANDOM (NO REPEATS) FOR THE TRAINING SET
 training <- sample(seq_len(100), size = 50)

 ### TRAINING SET: THE SAMPLED ROWS
 x_train <- x[training]
 y_train <- y[training]

 ### TEST SET: EVERY ROW THAT WAS NOT SAMPLED
 x_test <- x[-training]
 y_test <- y[-training]

 ### SORT THE TRAINING DATA BY X (ASCENDING). Y IS REORDERED FIRST, WHILE
 ### X_TRAIN IS STILL IN ITS ORIGINAL ORDER, SO THE (X, Y) PAIRS STAY TOGETHER.
 y_train <- y_train[order(x_train)]
 x_train <- sort(x_train)

 ### PREPARE THE TEST SET:
 ### (1) KEEP ONLY TEST OBSERVATIONS WHOSE X LIES INSIDE THE RANGE OF THE
 ###     TRAINING X VALUES (ABOVE THE TRAINING MINIMUM, UP TO AND INCLUDING
 ###     THE TRAINING MAXIMUM), BECAUSE LOCAL REGRESSION WILL ONLY PREDICT
 ###     WITHIN THE HULL OF THE TRAINING SET. AS ABOVE, Y IS SUBSET BEFORE X.
 y_test <- y_test[x_test <= max(x_train) & x_test > min(x_train)]
 x_test <- x_test[x_test <= max(x_train) & x_test > min(x_train)]

 ### (2) SORT THE REMAINING TEST OBSERVATIONS BY X (ASCENDING)
 y_test <- y_test[order(x_test)]
 x_test <- sort(x_test)

 ########## DATA IS READY. TIME FOR ANALYSIS...
 ### For each span 1/20, 2/20, ..., 20/20, fit a local-linear (degree = 1)
 ### loess model on the training set and record the root-mean-squared error
 ### (RMSE) of its predictions on the training set (in-sample) and on the
 ### test set (out-of-sample).
 n_spans <- 20L
 RMSE_in_sample <- numeric(n_spans)      # preallocated storage, training set
 RMSE_out_of_sample <- numeric(n_spans)  # preallocated storage, test set

 ### LOOPING THROUGH ALL THE SPANS FROM SPAN = 1/20 TO SPAN = 20/20
 for (i in seq_len(n_spans)) {
   fit.out <- loess(y_train ~ x_train, span = i / n_spans, degree = 1)
   RMSE_in_sample[i] <- sqrt(mean((y_train - predict(fit.out, x_train))^2))
   RMSE_out_of_sample[i] <- sqrt(mean((y_test - predict(fit.out, x_test))^2))
 }

 ### CREATE A TABLE SUMMARIZING RESULTS (ONE ROW PER SPAN)
 # The first column holds the span and is now given an explicit name;
 # positional access such as results_table[, 1] keeps working.
 results_table <- data.frame(span = seq_len(n_spans) / n_spans,
                             RMSE_in_sample, RMSE_out_of_sample)

 ### ESTIMATING AND PLOTTING THE INTERESTING RESULTS
 # Index (1..20) of the span with the lowest out-of-sample RMSE.
 # which.min() returns a single index (the first one on ties) and skips
 # NA/NaN entries, so the code below always works with one best span.
 smallest_RMSE <- which.min(RMSE_out_of_sample)

 cat("\nThe smallest RMSE (which is approximately ",
     round(RMSE_out_of_sample[smallest_RMSE]),
     ") is associated with a span of ",
     smallest_RMSE * (1/20), ".\n\n", sep = "")

 # Close the current graphics device only if one is actually open
 # (dev.off() errors on the null device), then stack two panels vertically.
 if (dev.cur() > 1L) invisible(dev.off())
 par(mfrow = c(2, 1))

 ### TOP PANEL: RMSE AGAINST SPAN
 # In-sample RMSE is drawn with the default open circles; the y-axis starts
 # at 0 and reaches the largest RMSE of either series.
 # NOTE: the title used to contain "\ ", which is not a valid R escape
 # sequence and stopped the file from parsing.
 plot(results_table[, 1], RMSE_in_sample,
      ylim = c(0, max(RMSE_out_of_sample, RMSE_in_sample, na.rm = TRUE)),
      ylab = "RMSE", xlab = "span", main = "Span vs. RMSE",
      xaxp = c(0, 1, 10))

 # Out-of-sample RMSE as filled circles, plus a text label for each series.
 points(results_table[, 1], RMSE_out_of_sample, pch = 19)
 text(x = 0.25, y = RMSE_out_of_sample[4] + 1, "OUT-OF-SAMPLE RMSE", cex = 0.7)
 text(x = 0.25, y = RMSE_in_sample[5] - 0.75, "IN-SAMPLE RMSE", cex = 0.7)

 # Highlight the best span: a large red circle around its point and a dashed
 # red vertical line from the x-axis up to it.
 points(results_table[smallest_RMSE, 1], RMSE_out_of_sample[smallest_RMSE],
        cex = 3, col = "red")
 lines(x = rep(results_table[smallest_RMSE, 1], 2),
       y = c(0, RMSE_out_of_sample[smallest_RMSE]), col = "red", lty = 2)

 ### BEST OUT OF SAMPLE FIT
 # Take the span from the search result instead of hard-coding it:
 # smallest_RMSE indexes the grid 1/20, ..., 20/20. Previously the code used
 # span = 0.10 while the plot label said 0.35; now the fitted curves and the
 # label always agree. [1] guards against several tied indices.
 best_span <- smallest_RMSE[1] / 20

 # WE USE TRAINING DATA TO GET THE "FIT.OUT" MODEL
 fit.out <- loess(y_train ~ x_train, span = best_span, degree = 1)

 # AND WE PLOT THE TEST DATA
 plot(x_test, y_test, main = "Predicting Test-Set Data",
      xlab = "Test-Set X Values", ylab = "Test-Set Y Values")

 # VIZ THE TRAINING MODEL APPLIED TO TEST DATA
 lines(x_test, predict(fit.out, x_test), lwd = 5, col = "red")

 ### TO SATISFY CURIOSITY, I VISUALIZE THE IN-SAMPLE TEST-SET FIT WITH THE SAME
 ### SPAN, SHOWN IN ORANGE, TO DEMONSTRATE THAT WE OVERFIT THE TEST SET WHEN WE
 ### BOTH FIT THE MODEL AND APPLY IT IN THE TEST SET
 fit.out2 <- loess(y_test ~ x_test, span = best_span, degree = 1)
 lines(x_test, predict(fit.out2, x_test), lwd = 5, col = "orange")

 # Labels are placed relative to each curve's predicted value at x = -2.2.
 text(x = -1, y = predict(fit.out, -2.2) + 3,
      paste0("BEST OUT-OF-SAMPLE PREDICTION \n span = ", best_span))
 text(x = -1.75, y = predict(fit.out2, -2.2) - 5, "OVERFIT IN-SAMPLE", cex = 0.7)
	### RESULTS FOR CS112 LESSON PLAN 11.1 Version 2

	# Empty the workspace and fix the seed so every run gives the same numbers.
	rm(list = ls())
	set.seed(13345)

	### TO CREATE THE DATA (LATER SEPARATED INTO TEST AND TRAINING SETS)
	# 100 predictor values drawn uniformly from [-4, 3], rounded to two decimals.
	x <- round(runif(100, -4, 3), 2)
	# Cubic signal. The multiplication signs must be written out in R:
	# "2x^2" and "5x" are syntax errors and stopped the file from parsing.
	y <- x^3 + 2 * x^2 - 5 * x - 3
	y <- jitter(y, 3000) # to add some random noise (i.e., the epsilon in the regression equation)

	### VISUALIZE THE DATA
	plot(x, y)

	### PICK 50 OF THE 100 ROW INDICES AT RANDOM (NO REPEATS) FOR THE TRAINING SET
	training <- sample(seq_len(100), size = 50)

	### TRAINING SET: THE SAMPLED ROWS
	x_train <- x[training]
	y_train <- y[training]

	### TEST SET: EVERY ROW THAT WAS NOT SAMPLED
	x_test <- x[-training]
	y_test <- y[-training]

	### SORT THE TRAINING DATA BY X (ASCENDING). Y IS REORDERED FIRST, WHILE
	### X_TRAIN IS STILL IN ITS ORIGINAL ORDER, SO THE (X, Y) PAIRS STAY TOGETHER.
	y_train <- y_train[order(x_train)]
	x_train <- sort(x_train)

	### PREPARE THE TEST SET:
	### (1) KEEP ONLY TEST OBSERVATIONS WHOSE X LIES INSIDE THE RANGE OF THE
	###     TRAINING X VALUES (ABOVE THE TRAINING MINIMUM, UP TO AND INCLUDING
	###     THE TRAINING MAXIMUM), BECAUSE LOCAL REGRESSION WILL ONLY PREDICT
	###     WITHIN THE HULL OF THE TRAINING SET. AS ABOVE, Y IS SUBSET BEFORE X.
	y_test <- y_test[x_test <= max(x_train) & x_test > min(x_train)]
	x_test <- x_test[x_test <= max(x_train) & x_test > min(x_train)]

	### (2) SORT THE REMAINING TEST OBSERVATIONS BY X (ASCENDING)
	y_test <- y_test[order(x_test)]
	x_test <- sort(x_test)

	########## DATA IS READY. TIME FOR ANALYSIS...
	### For each span 1/20, 2/20, ..., 20/20, fit a local-linear (degree = 1)
	### loess model on the training set and record the root-mean-squared error
	### (RMSE) of its predictions on the training set (in-sample) and on the
	### test set (out-of-sample).
	n_spans <- 20L
	RMSE_in_sample <- numeric(n_spans)      # preallocated storage, training set
	RMSE_out_of_sample <- numeric(n_spans)  # preallocated storage, test set

	### LOOPING THROUGH ALL THE SPANS FROM SPAN = 1/20 TO SPAN = 20/20
	for (i in seq_len(n_spans)) {
	  fit.out <- loess(y_train ~ x_train, span = i / n_spans, degree = 1)
	  RMSE_in_sample[i] <- sqrt(mean((y_train - predict(fit.out, x_train))^2))
	  RMSE_out_of_sample[i] <- sqrt(mean((y_test - predict(fit.out, x_test))^2))
	}

	### CREATE A TABLE SUMMARIZING RESULTS (ONE ROW PER SPAN)
	# The first column holds the span and is now given an explicit name;
	# positional access such as results_table[, 1] keeps working.
	results_table <- data.frame(span = seq_len(n_spans) / n_spans,
	                            RMSE_in_sample, RMSE_out_of_sample)

	### ESTIMATING AND PLOTTING THE INTERESTING RESULTS
	# Index (1..20) of the span with the lowest out-of-sample RMSE.
	# which.min() returns a single index (the first one on ties) and skips
	# NA/NaN entries, so the code below always works with one best span.
	smallest_RMSE <- which.min(RMSE_out_of_sample)

	cat("\nThe smallest RMSE (which is approximately ",
	    round(RMSE_out_of_sample[smallest_RMSE]),
	    ") is associated with a span of ",
	    smallest_RMSE * (1/20), ".\n\n", sep = "")

	# Close the current graphics device only if one is actually open
	# (dev.off() errors on the null device), then stack two panels vertically.
	if (dev.cur() > 1L) invisible(dev.off())
	par(mfrow = c(2, 1))

	### TOP PANEL: RMSE AGAINST SPAN
	# In-sample RMSE is drawn with the default open circles; the y-axis starts
	# at 0 and reaches the largest RMSE of either series.
	# NOTE: the title used to contain "\ ", which is not a valid R escape
	# sequence and stopped the file from parsing.
	plot(results_table[, 1], RMSE_in_sample,
	     ylim = c(0, max(RMSE_out_of_sample, RMSE_in_sample, na.rm = TRUE)),
	     ylab = "RMSE", xlab = "span", main = "Span vs. RMSE",
	     xaxp = c(0, 1, 10))

	# Out-of-sample RMSE as filled circles, plus a text label for each series.
	points(results_table[, 1], RMSE_out_of_sample, pch = 19)
	text(x = 0.25, y = RMSE_out_of_sample[4] + 1, "OUT-OF-SAMPLE RMSE", cex = 0.7)
	text(x = 0.25, y = RMSE_in_sample[5] - 0.75, "IN-SAMPLE RMSE", cex = 0.7)

	# Highlight the best span: a large red circle around its point and a dashed
	# red vertical line from the x-axis up to it.
	points(results_table[smallest_RMSE, 1], RMSE_out_of_sample[smallest_RMSE],
	       cex = 3, col = "red")
	lines(x = rep(results_table[smallest_RMSE, 1], 2),
	      y = c(0, RMSE_out_of_sample[smallest_RMSE]), col = "red", lty = 2)

	### BEST OUT OF SAMPLE FIT
	# Take the span from the search result instead of hard-coding it:
	# smallest_RMSE indexes the grid 1/20, ..., 20/20. Previously the code used
	# span = 0.10 while the plot label said 0.35; now the fitted curves and the
	# label always agree. [1] guards against several tied indices.
	best_span <- smallest_RMSE[1] / 20

	# WE USE TRAINING DATA TO GET THE "FIT.OUT" MODEL
	fit.out <- loess(y_train ~ x_train, span = best_span, degree = 1)

	# AND WE PLOT THE TEST DATA
	plot(x_test, y_test, main = "Predicting Test-Set Data",
	     xlab = "Test-Set X Values", ylab = "Test-Set Y Values")

	# VIZ THE TRAINING MODEL APPLIED TO TEST DATA
	lines(x_test, predict(fit.out, x_test), lwd = 5, col = "red")

	### TO SATISFY CURIOSITY, I VISUALIZE THE IN-SAMPLE TEST-SET FIT WITH THE SAME
	### SPAN, SHOWN IN ORANGE, TO DEMONSTRATE THAT WE OVERFIT THE TEST SET WHEN WE
	### BOTH FIT THE MODEL AND APPLY IT IN THE TEST SET
	fit.out2 <- loess(y_test ~ x_test, span = best_span, degree = 1)
	lines(x_test, predict(fit.out2, x_test), lwd = 5, col = "orange")

	# Labels are placed relative to each curve's predicted value at x = -2.2.
	text(x = -1, y = predict(fit.out, -2.2) + 3,
	     paste0("BEST OUT-OF-SAMPLE PREDICTION \n span = ", best_span))
	text(x = -1.75, y = predict(fit.out2, -2.2) - 5, "OVERFIT IN-SAMPLE", cex = 0.7)
No results found