CS130 LP 130 (Regression)
rm(list=ls())
training <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSUROPfTOZfUEpf6Ebby-vta5zWCwt9KK-KAwSvpToGQjQSKdhYsUfoHxYxvbOYxW8_IQxBD9FqWFJg/pub?gid=383144413&single=true&output=csv")
head(training)

# plot the data with big green dots
plot(training$x, training$y, main = "Training Data", pch = 16, cex = 3, col = "green")
################################################
#### RUN 3 DIFFERENT MODELS ON THE TRAINING SET

# first model, BIG SPAN = 1
fit_bigspan <- loess(y ~ x, data = training,
                     span = 1,
                     degree = 1) # fit the regression

# TWEAK the SPAN (SMALL SPAN) = 0.2
fit_smallspan <- loess(y ~ x, data = training,
                       span = 0.2, degree = 1)
# What about a linear regression model?
# How do we fit this training data with a linear regression?
reg1 <- lm(y ~ x, data = training)
reg1

# please use the 'data = training' format;
# do not use the training$x, training$y format
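# Why the 'data = training' format matters (a minimal illustrative sketch, not
# part of the assignment; 'reg_bad' is a hypothetical name): if the model is fit
# on training$x directly, predict() cannot find a matching column in 'newdata',
# so it warns and silently returns predictions for the original training rows.
reg_bad <- lm(training$y ~ training$x)
length(predict(reg_bad, newdata = data.frame(x = c(0, 1, 2))))  # warns; length is nrow(training), not 3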
##### OBTAIN TRAINING SET RMSEs FOR THE THREE MODELS

# to get the predicted ys for the training set,
# using the loess model with span = 1...
predicted_ys_training_bigspan <- predict(fit_bigspan)

# to get the predicted ys for the training set,
# using the loess model with span = 0.2...
predicted_ys_training_smallspan <- predict(fit_smallspan)

# to get the predicted ys for the training set,
# using the linear regression model (lm):
predicted_ys_training_lm <- predict(reg1, training)

cat("\nbig span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_bigspan - training$y)^2)), "\n")
cat("\nsmall span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_smallspan - training$y)^2)), "\n")
cat("\nlinear model, training set, RMSE =",
    sqrt(mean((predicted_ys_training_lm - training$y)^2)), "\n")
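# An optional refactor (not required by the assignment): a small helper that
# computes RMSE = sqrt(mean((predicted - observed)^2)), so the formula above
# does not have to be repeated for every model.
rmse <- function(predicted, observed) sqrt(mean((predicted - observed)^2))
rmse(predicted_ys_training_bigspan, training$y)     # same value as the "big span" line above
rmse(predicted_ys_training_smallspan, training$y)
rmse(predicted_ys_training_lm, training$y)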
############################################
###### OBTAIN RMSEs FOR THE TEST SET RESULTS
###### USING THE 3 MODELS ABOVE THAT WERE TRAINED ON THE TRAINING SET
test <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSnxeyJZa8zlij6jD4i0NMHjuJH_SY3bPO293PvSsqneki7fG2f_I6L3KL0QC831U4NSSyuXh8iFV2F/pub?gid=725957927&single=true&output=csv")
## let's remove test set observations outside the x-range of the training set
removed1 <- which(test$x > max(training$x))
removed2 <- which(test$x < min(training$x))
to_be_removed <- c(removed1, removed2)
# guard against the empty case: test[-integer(0), ] would drop every row
if (length(to_be_removed) > 0) test <- test[-to_be_removed, ]
head(test)
dim(test)
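# Why remove out-of-range test points? predict() for a loess fit (with the
# default surface = "interpolate") returns NA for x values outside the range
# of the training data, which would make the test-set MSEs below NA.
# A quick sanity check on a made-up out-of-range x value:
predict(fit_bigspan, newdata = data.frame(x = max(training$x) + 1))  # NA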
# to get the predicted ys for the TEST set
predicted_ys_test_bigspan <-
  predict(fit_bigspan, newdata = test)
predicted_ys_test_smallspan <-
  predict(fit_smallspan, newdata = test)

mse_bigspan_test <- mean((test$y - predicted_ys_test_bigspan)^2)
mse_smallspan_test <- mean((test$y - predicted_ys_test_smallspan)^2)

predicted_ys_test_lm <- predict(reg1, newdata = test)
mse_test_linear_regression <- mean((test$y - predicted_ys_test_lm)^2)
##############################
## PRINT SUMMARY OUTPUT...
cat("\nbig span loess, test set, RMSE =", sqrt(mse_bigspan_test), "\n")
cat("\nsmall span loess, test set, RMSE =", sqrt(mse_smallspan_test), "\n")
cat("\nlinear model, test set, RMSE =", sqrt(mse_test_linear_regression), "\n")

cat("\nbig span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_bigspan - training$y)^2)), "\n")
cat("\nsmall span loess, training set, RMSE =",
    sqrt(mean((predicted_ys_training_smallspan - training$y)^2)), "\n")
cat("\nlinear model, training set, RMSE =",
    sqrt(mean((predicted_ys_training_lm - training$y)^2)), "\n")
### TRAINING SET RESULTS ###
# big span loess RMSE   = 6.2
# small span loess RMSE = 1.8
# linear model RMSE     = 7.4

### TEST SET RESULTS ###
# big span loess RMSE   = 6.9
# small span loess RMSE = 4.5
# linear model RMSE     = 8.7
### CONCLUSIONS:
# (a) the test set results validated the training set's signal that small-span was best
#     - small span generalizes the best to new data
#     - even so, small span overfits (compare training 1.8 vs. test 4.5)
#     - it would be nice to know the DGP's actual irreducible error, sd(error):
#       is it < 4.5? (see the simulation sketch below)
# (b) big span loess & the linear model underfit (biased, 'rigid', low-variance models)
# (c) but big span loess and the linear model ALSO overfit:
#     training RMSE < test RMSE (6.2 < 6.9, and 7.4 < 8.7)
# (d) so it's possible to both underfit & overfit(!!!) -- see the two links below:
#     - https://stats.stackexchange.com/questions/488434/can-overfitting-and-underfitting-occur-simultaneously
#     - https://www.quora.com/Is-it-possible-for-a-Machine-Learning-model-to-simultaneously-overfit-and-underfit-the-training-data
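# Re: the irreducible-error question in conclusion (a). The true DGP behind these
# spreadsheets is unknown, but a simulation sketch with a made-up DGP (all numbers
# below are hypothetical) shows how test RMSE is bounded below, roughly, by sd(error):
set.seed(130)
n <- 200
x_sim <- runif(n, 0, 10)
y_sim <- 5 * sin(x_sim) + rnorm(n, sd = 2)           # irreducible error: sd(error) = 2
sim <- data.frame(x = x_sim, y = y_sim)
train_idx <- sample(n, n / 2)
fit_sim <- loess(y ~ x, data = sim[train_idx, ], span = 0.2, degree = 1)
pred_sim <- predict(fit_sim, newdata = sim[-train_idx, ])
sqrt(mean((pred_sim - sim$y[-train_idx])^2, na.rm = TRUE))  # typically close to, not much below, 2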
## THE END ##