Last active
August 29, 2015 14:08
-
-
Save ccagrawal/28f5b06f0578f5357df4 to your computer and use it in GitHub Desktop.
NBA Homecourt Advantage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quantifies homecourt advantage in each regular season
library(RCurl) | |
library(XML) | |
# Get Basketball Reference regular season schedule with home margin of victory
# for each game.
#
# Args:
#   year: Season identifier inserted into the schedule URL
#         (".../leagues/NBA_<year>_games.html").
#
# Returns:
#   A data frame with one row per row of the scraped "games" table and the
#   columns date (POSIXct), awayName, homeName, homeMargin (home points minus
#   away points) and year. Rows whose date or points cannot be parsed carry NA
#   in the corresponding fields.
#
# Stops with an error if the page does not contain a table named "games".
GetSchedule <- function(year) {
  url <- paste0(
    "http://www.basketball-reference.com/leagues/NBA_", year, "_games.html"
  )
  tables <- readHTMLTable(url)

  schedule <- tables[["games"]]
  if (is.null(schedule)) {
    stop("No 'games' table found at ", url, call. = FALSE)
  }

  # Columns are selected by position in the scraped table and renamed right
  # away, so the steps below do not depend on the site's header labels.
  schedule <- schedule[, c(1, 3, 4, 5, 6)]
  colnames(schedule) <- c(
    "date", "awayName", "awayPoints", "homeName", "homePoints"
  )

  # strptime() returns POSIXlt, a list-based class that is awkward inside data
  # frames; keep the same parsing but store the column as POSIXct.
  schedule$date <- as.POSIXct(
    strptime(as.character(schedule$date), format = "%a, %b %d, %Y")
  )

  # Go through character first so factor columns yield the printed scores
  # rather than their internal level codes.
  schedule$awayPoints <- as.numeric(as.character(schedule$awayPoints))
  schedule$homePoints <- as.numeric(as.character(schedule$homePoints))

  schedule$homeMargin <- schedule$homePoints - schedule$awayPoints
  schedule <- schedule[, c("date", "awayName", "homeName", "homeMargin")]
  schedule$year <- year
  schedule
}
# Calculate rolling mean, SD, and confidence interval (rolling over n seasons)
#
# Args:
#   fullSchedule: Data frame with at least the columns `year` and `homeMargin`.
#   n:            Number of consecutive seasons in each rolling window.
#   startYear:    First season of the first window.
#   endYear:      Last season of the last window.
#   confidence:   Confidence level of the interval, strictly between 0 and 1
#                 (e.g. 0.95).
#
# Returns:
#   A data frame with one row per window, labelled by the window's last season,
#   and the columns:
#     year  - last season of the window
#     games - number of games in the window with a non-missing home margin
#     mean  - mean home margin
#     sd    - sample standard deviation of the home margin
#     lb/ub - lower/upper bound of the t-based confidence interval for the mean
#
# Stops with an error if the arguments do not describe at least one window.
CalcStats <- function(fullSchedule, n, startYear, endYear, confidence) {
  stopifnot(
    n >= 1,
    endYear - startYear - n + 1 >= 0,
    confidence > 0,
    confidence < 1
  )

  windowEnds <- seq(from = startYear + n - 1, to = endYear)
  upperTail <- 1 - (1 - confidence) / 2

  stats <- vapply(windowEnds, function(lastYear) {
    windowYears <- seq(from = lastYear - n + 1, to = lastYear)
    margins <- fullSchedule$homeMargin[fullSchedule$year %in% windowYears]
    # Rows without a usable margin would otherwise turn every statistic of the
    # window into NA, so they are dropped and not counted as games.
    margins <- margins[!is.na(margins)]

    games <- length(margins)
    avg <- mean(margins)
    spread <- sd(margins)

    # The interval for the mean is built from the standard error
    # (sd / sqrt(games)), not from the raw standard deviation. It is undefined
    # with fewer than two games.
    halfWidth <- if (games >= 2) {
      qt(upperTail, games - 1) * spread / sqrt(games)
    } else {
      NA_real_
    }

    c(games, avg, spread, avg - halfWidth, avg + halfWidth)
  }, numeric(5))

  data.frame(
    year = windowEnds,
    games = stats[1, ],
    mean = stats[2, ],
    sd = stats[3, ],
    lb = stats[4, ],
    ub = stats[5, ]
  )
}
# BBall Ref has schedules from 1950 - 2014
startYear <- 1950
endYear <- 2014

# Download every season's schedule, printing each year once it has been
# fetched. The seasons are collected in a list and bound in a single step;
# growing the data frame with rbind() inside the loop would re-copy all
# previously downloaded rows on every iteration.
seasons <- lapply(startYear:endYear, function(year) {
  schedule <- GetSchedule(year)
  cat(year, '\n')
  schedule
})
fullSchedule <- do.call(rbind, seasons)

# Rolling statistics over 5-season windows at the 95% confidence level
results <- CalcStats(fullSchedule, 5, startYear, endYear, .95)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment