-
-
Save Sandy4321/6fe82bf9b79015353afc to your computer and use it in GitHub Desktop.
nba_2015_random_forest_vars
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("ggthemes") | |
c("randomForest","dplyr","magrittr","ggplot2","ggthemes") -> packages | |
lapply(packages,library, character.only = TRUE) | |
getBREFTeamStatTable <- function(season_end = 2015, table_name = 'team', date = T){ | |
c('rvest','dplyr','pipeR','RCurl', 'XML','reshape2') -> packages | |
lapply(packages, library, character.only = T) | |
'http://www.basketball-reference.com/leagues/' -> base | |
(season_end-1) %>>% paste0("-",season_end) -> season | |
'NBA' -> league | |
table_name %>>% tolower -> table_name | |
'#' %>>% paste0(table_name) -> css_page | |
table_name %>>% paste('stats', sep = "_") -> table | |
css_page %>>% paste0(" a") -> css_id | |
base %>>% paste0(league,'_',season_end,".html") -> url | |
if(table_name == 'standings'){ | |
pipeline({ | |
url | |
html | |
html_table(fill = T) | |
~t | |
}) | |
if(season_end >= 1971){ | |
t[2] %>>% data.frame %>>% tbl_df -> df | |
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df) | |
'Eastern' -> df$conference | |
t[3] %>>% data.frame() %>>% tbl_df -> df2 | |
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df2) | |
'Western' -> df2$conference | |
rbind_list(df,df2) -> df | |
df$team %>>% (grepl('\\*',.)) -> df$playoff_team | |
df$team %>>% (gsub('\\*','',.)) -> df$team | |
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>% | |
(.$conference_rank) -> conference_rank | |
conference_rank %>>% (gsub('\\)','',.)) %>>% as.numeric -> df$conference_rank | |
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>% | |
(.$team) %>>% Trim -> df$team | |
} else{ | |
t[2] %>>% data.frame %>>% tbl_df -> df | |
c('team','wins','losses','win_pct','games_back','pts.g','opp_pts.g','srs') -> names(df) | |
df$team %>>% (grep('Western Division',.)) %>>% as.numeric -> div | |
'' -> df$conference | |
'Eastern' -> df[1:div,'conference'] | |
'Western' -> df[div:nrow(df),'conference'] | |
if(grep('\\(',df$team) %>>% length > 0){ | |
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>% | |
(.$conference_rank) -> conference_rank | |
conference_rank %>>% (gsub('\\)','',.)) %>>% as.numeric -> df$conference_rank | |
df$team %>>% (colsplit(.,'\\(',c('team','conference_rank'))) %>>% | |
(.$team) %>>% Trim -> df$team | |
} | |
df$team %>>% (grepl('\\*',.)) -> df$playoff_team | |
df$team %>>% (gsub('\\*','',.)) -> df$team | |
} | |
df[df$games_back == '—','games_back'] <- 0 | |
df$games_back %>>% as.numeric -> df$games_back | |
df %>>% | |
filter(team == 'Baltimore Bullets'|!is.na(wins)) -> df | |
df$pts.g - df$opp_pts.g -> df$point_differential | |
pipeline({ | |
url | |
html | |
xpathSApply(path = '//*[(@id = "all_standings")]//a',xmlAttrs) | |
unlist | |
~team_url_stems | |
}) | |
NULL -> names(team_url_stems) | |
team_url_stems %>>% (gsub('.html|\\/teams|\\/','',.)) %>>% | |
(gsub(pattern = "[^[:alpha:]]", replacement = "", .)) -> bref_team_id | |
data.frame(season,table_name = 'standings', bref_team_id, df) -> df | |
'http://www.basketball-reference.com' %>>% paste0(team_url_stems) -> df$bref_team_season_url | |
} else{ | |
url %>>% ## get table | |
html %>>% | |
html_nodes(css_page) %>>% | |
html_table(header = F) %>>% data.frame() %>>% tbl_df() -> df | |
if(df$X1[1] == 'Rk'){ | |
df %>>% | |
filter(X1 == "Rk") %>>% as.character -> names | |
'Rk' %>>% grep(x = df$X1) -> row_of_header #find where rank is | |
(row_of_header + 1) %>>% (df[.:nrow(df),]) -> df #skip that row and go to the end | |
names %>>% tolower-> names(df)} else{ | |
df %>>% | |
filter(X1 == "Rk") %>>% as.character -> names | |
'Rk' %>>% grep(x = df$X1) -> row_of_header #find where rank is | |
(row_of_header + 1) %>>% (df[.:nrow(df),]) -> df #skip that row and go to the end | |
names %>>% tolower-> names(df) | |
} | |
names(df) %>>% (gsub('\\%|/','\\.',.)) -> names(df) | |
NULL -> df$rk | |
c('team','arena') -> table_name_character | |
df[,!(df %>>% names) %in% table_name_character] %>>% apply(2, function(x) gsub('\\,','',x) %>>% | |
as.numeric(x)) -> df[,!(df %>>% names) %in% table_name_character] #get rid of commas and make numeric | |
df$team %>>% grepl(pattern = '\\*') -> df$playoff_team | |
df$team %>>% (gsub('\\*','',.)) -> df$team | |
df %>>% nrow() -1 -> rows | |
df[1:rows,] -> df | |
paste0("//*[(@id = '",table_name,"')]//a") -> xpath | |
##Grab Team Ids | |
url %>>% ## get table | |
html %>>% | |
xpathSApply(xpath,xmlAttrs) %>>% | |
unlist %>>% as.character -> stems | |
stems %>>% (gsub('\\/|.html|teams','',.)) %>>% | |
(gsub(season_end,'',.)) -> bref_team_id | |
data.frame(season,table_name = table, bref_team_id, df) -> df | |
'http://www.basketball-reference.com' %>>% paste0(stems) -> df$bref_team_season_url | |
} | |
if(date == T){ | |
Sys.time() -> df$scrape_time | |
} | |
return(df) | |
} | |
getBREFTeamStatTable(season_end = 2015) -> stats | |
getBREFTeamStatTable(season_end = 2015, table_name = 'standings') -> wins | |
getBREFTeamStatTable(season_end = 2015, table_name = 'opponent') -> opp | |
getBREFTeamStatTable(season_end = 2015, table_name = 'misc') -> misc | |
misc %>% | |
select(bref_team_id:ft.fga.1,attendance,-pl,-pw) -> misc | |
opp %>% | |
select(team,bref_team_id,fg:pts.g) -> opp | |
names(opp)[3:24] %<>% paste0("opp_",.) | |
wins[,c('team','wins')] %>% | |
left_join(stats) -> nba_team_data | |
nba_team_data %>% left_join(opp) -> nba_team_data | |
misc %>% left_join(nba_team_data) %>% | |
select(-season,-table_name,-playoff_team,-bref_team_season_url,-scrape_time) -> data_nba | |
data_nba %>% | |
select(-team,-bref_team_id, -g,-mp,-mov, -sos, -srs, -pts) -> rf_data | |
formula = as.formula(wins ~ .) | |
library(randomForest) | |
set.seed(Sys.time()) | |
#formula = as.formula(Wins ~ FG + FGA + Three_Point + Three_PointA + FT + FTA + | |
# ORB + DRB + AST + STL + BLK + TOV + PF + PTS + FG_Opp + FGA_Opp + | |
# Three_Point_Opp + Three_PointA_Opp + FT_Opp + FTA_Opp + ORB_Opp + DRB_Opp + | |
# AST_Opp + STL_Opp + BLK_Opp + TOV_Opp + PF_Opp + PTS_Opp + Age) | |
formula %>% | |
randomForest(data=rf_data, mtry=5, ntree=10000, importance=TRUE) -> rf | |
imp = data.frame(importance(rf), check.names=F) | |
c("MSE.Increase") -> names(imp)[1] | |
#row.names(imp) -> imp$name | |
row.names(imp) -> imp$column_name | |
imp %>% tbl_df -> imp | |
### Set Correct Names | |
#"~/Desktop/Github/asb_shiny_apps/nba/explorer/v3/data/advanced_player_stat_table_names.csv" %>% | |
# read.csv -> adv_name | |
#"~/Desktop/Github/asb_shiny_apps/nba/explorer/v3/data/per_game_player_stat_table_names.csv" %>% | |
# read.csv -> stats_names | |
#rbind(stats_names,adv_name) %>% unique -> table.names | |
#merge(table.names, imp, all.x = T, all.y = T) %>% | |
# filter(!is.na(MSE.Increase)) -> imp | |
#write.csv(imp,'imp.csv') | |
#c('Attendance','Opponent Effective Field Goal %','FT to FGA','Opponent FT to FGA','Opponent Assists','Opponent Blocks','Opponent Defensive Rebounds','Opponent Field Goals','Opponent Field Goal %','Opponent Field Goal Attempts','Opponent Free Throws','Opponent Free Throw %','Opponent Free Throw Attempts',,,,,,, | |
# , ->imp[2:60,] -> imp | |
#'imp.csv' %>% read.csv %>% tbl_df -> imp | |
imp %>% | |
ggplot(aes(x=reorder(column_name,MSE.Increase), y=MSE.Increase, fill = name)) + | |
geom_bar(stat='identity') + | |
geom_hline(yintercept=abs(min(imp$MSE.Increase)), col=2, linetype='dashed') + | |
coord_flip() + | |
theme_fivethirtyeight() + | |
ylab("MSE Increase") + | |
theme(legend.position="none") + | |
scale_y_continuous(breaks=seq(0,40,2)) + | |
labs(title="NBA Win Contribution Factors via Random Forest, 2014-15 Season",x= "MSE Increase") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment