######### The local regression (loess) part begins at the "#######" marker below
### Arbitrarily define some simple data-generating process ("DGP"):
X <- 1:100
set.seed(9876) # this sets the random seed, ensuring replicability
irreducible_error <- rnorm(100)
# Here's the data-generating process ("DGP")
Y <- 10*X + 500*irreducible_error
data_set <- data.frame(X, Y)
# You could plot it if you wished...
plot(data_set$X, data_set$Y, col = "blue",
     pch = 16, # pch = 16 creates a filled circle
     cex = 2,  # cex = 2 doubles the size of the points
     xlab = "Values of X", ylab = "Values of Y", main = "The Data")
### Take the 100 observations in the data
### and split the data into a 50 observation training set
### and a 50 observation test set...
# 1 line of code below, w/ ";", to ensure you run the set.seed line
set.seed(5); training_set_rows <- sample(1:100, 50, replace = FALSE)
# define the training set by selecting only the 50 training set rows
training_set <- data_set[training_set_rows, ]
# define the test set by deleting the 50 training set rows
test_set <- data_set[-training_set_rows, ]
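# A quick check (added): both sets should have 50 rows and share no rows,
# since sample() drew row indices without replacement.
nrow(training_set); nrow(test_set) # 50 and 50
length(intersect(rownames(training_set), rownames(test_set))) # should be 0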
# you can plot both sets to visually confirm the sampling looks random
dev.off() # to erase the previous figure
plot(training_set[, 1], training_set[, 2], pch = 16, col = "blue",
     xlab = "Values of X", ylab = "Values of Y",
     main = "Training Set in Blue, Test Set in Orange")
# now add the test set points in orange
points(test_set[, 1], test_set[, 2], pch = 16, col = "orange")
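# Optional (added): a legend makes the two-color plot self-explanatory.
legend("topleft", legend = c("Training set", "Test set"),
       col = c("blue", "orange"), pch = 16)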
### Now, get real. Ordinarily you would only have the Xs and Ys
### and you would not know the DGP (but you would WANT to know it).
# You might decide to fit a linear regression.
# If you did fit a linear regression here, do you think you'd be overfitting?
# How could you assess whether you were overfitting?
# Run a linear regression on the training set. Call it "reg1"
reg1 <- lm(Y ~ X, data = training_set)
summary(reg1) # to observe the regression results; the output below should appear
# Call:
# lm(formula = Y ~ X, data = training_set)
#
# Residuals:
#    Min     1Q Median     3Q    Max
# -968.8 -348.6   -2.0  287.8 1411.5
#
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept) -172.630    146.342  -1.180    0.244
# X             14.222      2.584   5.505 1.42e-06 ***
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 485.6 on 48 degrees of freedom
# Multiple R-squared:  0.387, Adjusted R-squared:  0.3742
# F-statistic: 30.3 on 1 and 48 DF,  p-value: 1.418e-06
# (1) What does your regression say "f", the relationship between X and Y, is?
# (2) Is the median residual -2? (residual = 'observed Y' - 'predicted Y')
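# One way to check both questions (added):
coef(reg1)              # (1) the estimated f: Y-hat = -172.63 + 14.22 * X
median(residuals(reg1)) # (2) compare to the -2.0 in the summary() output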
# Obtain the training set Y values predicted by the regression
training_lm_y_pred <- predict(reg1, newdata = training_set)
# Calculate the training set error (mean squared error)
# How?
## Calculate the test set error (mean squared error)
# First use the regression to make "Y" predictions for the test set
test_lm_y_pred <- predict(reg1, newdata = test_set)
## The pseudocode is now code below. What do the results imply? Are we overfitting?
mean((training_set$Y - training_lm_y_pred)^2)
mean((test_set$Y - test_lm_y_pred)^2)
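# A reusable helper (added sketch): wrapping the MSE formula in a function
# avoids repeating it for every model we fit below.
mse <- function(observed, predicted) mean((observed - predicted)^2, na.rm = TRUE)
mse(training_set$Y, training_lm_y_pred) # should match the training MSE above
mse(test_set$Y, test_lm_y_pred)         # should match the test MSE above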
#######
# fit a flexible local regression to the training set
fit1.0 <- loess(Y ~ X, span = 0.2, degree = 1, data = training_set)
training_loess_y_pred <- predict(fit1.0, newdata = training_set)
# Calculate the training set error (mean squared error)
# How? (same approach as for the linear regression above)
## Calculate the test set error (mean squared error)
# First use the regression to make "Y" predictions for the test set
test_loess_y_pred <- predict(fit1.0, newdata = test_set)
mean((training_set$Y - training_loess_y_pred)^2)
# challenge question: why do I have to include na.rm = TRUE in the below?
mean((test_set$Y - test_loess_y_pred)^2, na.rm = TRUE)
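# Hint (added): predict.loess() returns NA for test X values that fall outside
# the range of the training X values, because loess interpolates rather than
# extrapolates by default; na.rm = TRUE drops those NAs before averaging.
# A sketch (added): refitting across several spans shows how flexibility
# drives overfitting; smaller spans track the training set more closely
# but can generalize worse.
for (s in c(0.2, 0.5, 0.75, 1)) {
  fit_s <- loess(Y ~ X, span = s, degree = 1, data = training_set)
  cat("span =", s,
      "train MSE =", round(mean((training_set$Y - predict(fit_s, newdata = training_set))^2, na.rm = TRUE)),
      "test MSE =", round(mean((test_set$Y - predict(fit_s, newdata = test_set))^2, na.rm = TRUE)), "\n")
}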
# Create an informative plot
plot(training_set$X, training_set$Y, pch = 16, cex = 1.5,
     xlab = "Values of X", ylab = "Values of Y",
     main = "Training Data with Loess Fit")
points(training_set$X, training_loess_y_pred, pch = 16,
       col = "violet", cex = .75)
# To draw line segments between the predicted points, you have to make sure
# to connect the predicted points in order from left to right...
lines(training_set$X[order(training_set$X)],
      training_loess_y_pred[order(training_set$X)],
      col = "violet")