CS130 LP 130 (Regression)
rm(list=ls())
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
head(training)

# plot the data with big green dots
plot(training$x, training$y, main = "Training Data", pch = 16, cex = 3, col = "green")
################################################
#### RUN 3 DIFFERENT MODELS ON THE TRAINING SET

# first model, BIG SPAN = 1
fit_bigspan <- loess(y ~ x, data = training,
                     span = 1,
                     degree = 1) # fit the regression

# TWEAK the SPAN (SMALL SPAN) = 0.2
fit_smallspan <- loess(y ~ x, data = training,
                       span = 0.2, degree = 1)
# What about a linear regression model?
# How do we fit this training data with a linear regression?
reg1 <- lm(y ~ x, data = training)
reg1

# please use the 'data = training' format;
# do not use the training$x, training$y format
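# Why the 'data = training' format matters (a minimal illustrative sketch, not
# part of the assignment; 'reg_bad' is a hypothetical name): if the model is fit
# on training$x directly, predict() cannot find a matching column in 'newdata',
# so it warns and silently returns predictions for the original training rows.
reg_bad <- lm(training$y ~ training$x)
length(predict(reg_bad, newdata = data.frame(x = c(0, 1, 2))))  # warns; length is nrow(training), not 3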
##### OBTAIN TRAINING SET RMSEs FOR THE THREE MODELS

# to get the predicted ys for the training set,
# using the loess model with span = 1...
predicted_ys_training_bigspan <- predict(fit_bigspan)

# to get the predicted ys for the training set,
# using the loess model with span = 0.2...
predicted_ys_training_smallspan <- predict(fit_smallspan)

# to get the predicted ys for the training set,
# using the linear regression model (lm):
predicted_ys_training_lm <- predict(reg1, training)

cat("\nbig span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_bigspan - training$y)^2)), "\n")
cat("\nsmall span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_smallspan - training$y)^2)), "\n")
cat("\nlinear model, training set, RMSE =",
    sqrt(mean((predicted_ys_training_lm - training$y)^2)), "\n")
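# An optional refactor (not required by the assignment): a small helper that
# computes RMSE = sqrt(mean((predicted - observed)^2)), so the formula above
# does not have to be repeated for every model.
rmse <- function(predicted, observed) sqrt(mean((predicted - observed)^2))
rmse(predicted_ys_training_bigspan, training$y)     # same value as the "big span" line above
rmse(predicted_ys_training_smallspan, training$y)
rmse(predicted_ys_training_lm, training$y)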
############################################
###### OBTAIN RMSEs FOR THE TEST SET RESULTS
###### USING THE 3 MODELS ABOVE THAT WERE TRAINED ON THE TRAINING SET
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")
## let's remove test set observations outside the x-range of the training set
removed1 <- which(test$x > max(training$x))
removed2 <- which(test$x < min(training$x))
to_be_removed <- c(removed1, removed2)
# guard against the empty case: test[-integer(0), ] would drop every row
if (length(to_be_removed) > 0) test <- test[-to_be_removed, ]
head(test)
dim(test)
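# Why remove out-of-range test points? predict() for a loess fit (with the
# default surface = "interpolate") returns NA for x values outside the range
# of the training data, which would make the test-set MSEs below NA.
# A quick sanity check on a made-up out-of-range x value:
predict(fit_bigspan, newdata = data.frame(x = max(training$x) + 1))  # NA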
# to get the predicted ys for the TEST set
predicted_ys_test_bigspan <-
  predict(fit_bigspan, newdata = test)
predicted_ys_test_smallspan <-
  predict(fit_smallspan, newdata = test)

mse_bigspan_test <- mean((test$y - predicted_ys_test_bigspan)^2)
mse_smallspan_test <- mean((test$y - predicted_ys_test_smallspan)^2)

predicted_ys_test_lm <- predict(reg1, newdata = test)
mse_test_linear_regression <- mean((test$y - predicted_ys_test_lm)^2)
##############################
## PRINT SUMMARY OUTPUT...
cat("\nbig span loess, test set, RMSE =", sqrt(mse_bigspan_test), "\n")
cat("\nsmall span loess, test set, RMSE =", sqrt(mse_smallspan_test), "\n")
cat("\nlinear model, test set, RMSE =", sqrt(mse_test_linear_regression), "\n")

cat("\nbig span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_bigspan - training$y)^2)), "\n")
cat("\nsmall span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_smallspan - training$y)^2)), "\n")
cat("\nlinear model, training set, RMSE =",
    sqrt(mean((predicted_ys_training_lm - training$y)^2)), "\n")
### TRAINING SET RESULTS ###
# big span loess RMSE   = 6.2
# small span loess RMSE = 1.8
# linear model RMSE     = 7.4

### TEST SET RESULTS ###
# big span loess RMSE   = 6.9
# small span loess RMSE = 4.5
# linear model RMSE     = 8.7
### CONCLUSIONS:
# (a) the test set results validated the training set's signal that small-span was best
#     - small span generalizes the best to new data
#     - even so, small span overfits (compare training 1.8 vs. test 4.5)
#     - it would be nice to know the DGP's actual irreducible error, sd(error):
#       is it < 4.5? (see the simulation sketch below)
# (b) big span loess & the linear model underfit (biased, 'rigid', low-variance models)
# (c) but big span loess and the linear model ALSO overfit:
#     training RMSE < test RMSE (6.2 < 6.9, and 7.4 < 8.7)
# (d) so it's possible to both underfit & overfit(!!!) -- see the two links below:
#     - https://stats.stackexchange.com/questions/488434/can-overfitting-and-underfitting-occur-simultaneously
#     - https://www.quora.com/Is-it-possible-for-a-Machine-Learning-model-to-simultaneously-overfit-and-underfit-the-training-data
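# Re: the irreducible-error question in conclusion (a). The true DGP behind these
# spreadsheets is unknown, but a simulation sketch with a made-up DGP (all numbers
# below are hypothetical) shows how test RMSE is bounded below, roughly, by sd(error):
set.seed(130)
n <- 200
x_sim <- runif(n, 0, 10)
y_sim <- 5 * sin(x_sim) + rnorm(n, sd = 2)           # irreducible error: sd(error) = 2
sim <- data.frame(x = x_sim, y = y_sim)
train_idx <- sample(n, n / 2)
fit_sim <- loess(y ~ x, data = sim[train_idx, ], span = 0.2, degree = 1)
pred_sim <- predict(fit_sim, newdata = sim[-train_idx, ])
sqrt(mean((pred_sim - sim$y[-train_idx])^2, na.rm = TRUE))  # typically close to, not much below, 2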
## THE END ##