Created
April 25, 2016 17:20
-
-
Save primaryobjects/8df8eb44a8fbb5d2fe08829d28dd7321 to your computer and use it in GitHub Desktop.
NBA dataset analysis and linear regression prediction.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NBA dataset analysis and linear regression prediction.
#
# Fits a wins ~ point-differential model and a points-scored model on the
# training data, prunes the points model one predictor at a time, and then
# evaluates the final model on a held-out test set.
# Numeric results quoted in the comments below are the values reported by the
# original author; re-run the script to confirm them against your data.

# ---- Load training data ----
# NOTE(review): this file name is lower-case while the test file further down
# is 'NBA_test.csv'; confirm both match the files on disk (this matters on
# case-sensitive file systems).
NBA <- read.csv('nba_train.csv')

# Seems around 42 wins, the team has a good chance of making it to the playoffs.
table(NBA$W, NBA$Playoffs)

# ---- Wins as a function of point differential ----
NBA$PTSdiff <- NBA$PTS - NBA$oppPTS
WinsReg <- lm(W ~ PTSdiff, data = NBA)
summary(WinsReg)

# ---- Points scored: full model ----
# NOTE(review): X2PA / X3PA presumably come from CSV headers that start with a
# digit and were prefixed with "X" on import -- verify against the CSV.
PointsReg <- lm(
  PTS ~ X2PA + X3PA + FTA + AST + ORB + DRB + TOV + STL + BLK,
  data = NBA
)
# The p-values and R-squared cited in the pruning steps below come from these
# model summaries.
summary(PointsReg)

SSE <- sum(PointsReg$residuals^2)
# Average error is 184 points.
# Divide by the number of residuals rather than nrow(NBA): lm() drops rows
# with missing values by default, so the two counts can differ.
RMSE <- sqrt(SSE / length(PointsReg$residuals))

# Average points for a year is 8370, so an error of 184 points isn't so bad.
mean(NBA$PTS)

# ---- Points scored: backward elimination ----
# Each step drops exactly one term from the previous model via update(), so the
# removed predictor is explicit and the remaining formula is never retyped.

# Remove TOV because p-value is largest value, so the least statistically
# significant.
PointsReg2 <- update(PointsReg, . ~ . - TOV)
summary(PointsReg2)

# Remove DRB because highest p-value. Notice R-squared stays the same, so we're
# justified in removing the feature.
PointsReg3 <- update(PointsReg2, . ~ . - DRB)
summary(PointsReg3)

# Remove BLK. R-squared stays the same, looking good.
PointsReg4 <- update(PointsReg3, . ~ . - BLK)
summary(PointsReg4)

SSE4 <- sum(PointsReg4$residuals^2)
# Average error is 184.5, about the same as first model, but simpler (fewer
# features).
RMSE4 <- sqrt(SSE4 / length(PointsReg4$residuals))

# ---- Evaluate on the test set ----
NBA_test <- read.csv('NBA_test.csv')

# Predict how many points in 2012-2013 season.
PointsPredictions <- predict(PointsReg4, newdata = NBA_test)

SSE_test <- sum((PointsPredictions - NBA_test$PTS)^2)
# The baseline is the *training* mean of PTS, so R2 below is an out-of-sample
# R-squared (it compares the model against predicting the training average).
SST <- sum((mean(NBA$PTS) - NBA_test$PTS)^2)

# R-squared is 0.81
R2 <- 1 - SSE_test / SST

# Root mean squared error on the test set is 196, close to the training set.
RMSE_test <- sqrt(SSE_test / nrow(NBA_test))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment