Skip to content

Instantly share code, notes, and snippets.

@boooeee
Created September 28, 2025 00:26
Show Gist options
  • Save boooeee/38d14a8d02589790656a12595d10b3a5 to your computer and use it in GitHub Desktop.
Save boooeee/38d14a8d02589790656a12595d10b3a5 to your computer and use it in GitHub Desktop.
high school volleyball simple ranking system
# Volleyball Team Simple Rating System (SRS)
#
# This script scrapes high school volleyball game data from SI.com and calculates
# team rankings using a Simple Rating System methodology based on set margins and strength of schedule.
#
# Author: Mike Beuoy
# GitHub: https://gist.github.com/boooeee
#
# SETUP INSTRUCTIONS:
# 1. Find your team's SI.com ID by visiting https://www.si.com/high-school/stats/, search for your team, navigate to "Teams", and then select the desired volleyball team. The team id will be part of the url.
# 2. Update the YOUR_TEAM_ID variable below with your team's ID
# 3. Install required packages if not already installed:
# install.packages(c("dplyr", "rvest", "tidyr", "stringr", "gt"))
# 4. Run the script!
# Clear environment
rm(list = ls(all = TRUE))
# Load required packages
suppressPackageStartupMessages({
library(dplyr) # Data manipulation
library(rvest) # Web scraping
library(tidyr) # Data tidying
library(stringr) # String manipulation
library(gt) # Table formatting
})
# =============================================================================
# CONFIGURATION - MODIFY THESE VALUES FOR YOUR TEAM
# =============================================================================
# Your team's SI.com ID (find this in your team's SI.com URL)
# Example: https://www.si.com/high-school/stats/california/volleyball/teams/448026/games
# The team ID would be 448026
YOUR_TEAM_ID <- 448022 # <-- CHANGE THIS TO YOUR TEAM'S ID
# Analysis parameters (usually don't need to change these)
MAX_NETWORK_TEAMS <- 50 # Maximum teams to include in competitive network
MAX_ITERATIONS <- 5 # Maximum iterations for network expansion
RETRY_ATTEMPTS <- 5 # Number of retry attempts for failed web requests
RETRY_DELAY <- 5 # Seconds to wait between retry attempts
LEAGUE_SOURCE <- "standings" # enter "derived" to derive league teams from schedule, enter "standings" to use the league teams indicated in the standings page
ONLY_BEST_OF_5 <- TRUE # Set to TRUE to only include games where winning team won 3 sets
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
#' Safe web scraping function with retry logic
#' @param url URL to scrape
#' @param max_retries Maximum number of retry attempts
#' @param sleep_time Seconds to wait between retries
#' @return HTML content or NULL if all attempts fail
safe_pull <- function(url, max_retries = RETRY_ATTEMPTS, sleep_time = RETRY_DELAY) {
for (i in 1:max_retries) {
result <- tryCatch(
read_html(url),
error = function(e) {
if (i == max_retries) {
warning(paste("Failed to retrieve:", url, "after", max_retries, "attempts"))
}
return(NULL)
}
)
if (!is.null(result)) {
return(result)
}
if (i < max_retries) {
cat(" Retry", i, "failed, waiting", sleep_time, "seconds...\n")
Sys.sleep(sleep_time)
}
}
return(NULL)
}
#' Extract table from SI.com page with retry logic
#' @param url URL of the SI.com team games page
#' @param max_retries Maximum number of retry attempts
#' @param sleep_time Seconds to wait between retries
#' @return HTML table or NULL if extraction fails
extract_si_table <- function(url, max_retries = RETRY_ATTEMPTS, sleep_time = RETRY_DELAY, class_selector = "Games-module") {
for (attempt in 1:max_retries) {
raw_html <- tryCatch(
readChar(url, nchars = 1e8),
error = function(e) NULL
)
if (!is.null(raw_html)) {
# Look for table with specified module in the class
games_module_pattern <- paste0('<table[^>]*class="[^"]*',class_selector,'[^"]*"[^>]*>')
games_module_match <- str_locate(raw_html, games_module_pattern)
if (!is.na(games_module_match[1])) {
# Find the start of the <table> tag for this Games-module table
table_start <- str_locate(raw_html, "<table")[1]
table_end <- str_locate(raw_html, "</table>")[2]
} else {
# Fallback to generic table search
table_start <- str_locate(raw_html, "<table")[1]
table_end <- str_locate(raw_html, "</table>")[2]
}
if (!is.na(table_start) && !is.na(table_end)) {
result <- tryCatch(
read_html(substr(raw_html, table_start, table_end)),
error = function(e) NULL
)
if (!is.null(result)) {
return(result)
}
}
}
# If we get here, the attempt failed
if (attempt < max_retries) {
cat(" Retry", attempt, "of", max_retries, "failed, waiting", sleep_time, "seconds...\n")
Sys.sleep(sleep_time)
} else {
warning(paste("Failed to extract table from:", url, "after", max_retries, "attempts"))
}
}
return(NULL)
}
# =============================================================================
# MAIN ANALYSIS
# =============================================================================
cat("๐Ÿ Volleyball Simple Rating System Analysis\n")
cat("==========================================\n\n")
# Constants
BASE_URL <- "https://www.si.com/"
TEAM_BASE_URL <- "https://www.si.com/high-school/stats/california/volleyball/teams/"
# Step 1: Get initial team schedule and identify league teams
cat("Step 1: Analyzing your team's schedule...\n")
your_team_url <- paste0(TEAM_BASE_URL, YOUR_TEAM_ID, "/games")
team_table <- extract_si_table(your_team_url)
if (is.null(team_table)) {
stop("โŒ Could not retrieve data for your team. Please check:\n",
" 1. Your team ID is correct\n",
" 2. Your internet connection is working\n",
" 3. SI.com is accessible")
}
# Extract game information from your team's schedule
team_urls <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_attr(name = "href")
team_names <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_text()
results <- team_table %>%
html_elements(xpath = "//table//td[3]//a/span") %>%
html_text() %>%
str_squish()
game_types <- team_table %>%
html_elements(xpath = "//table//td[5]") %>%
html_text() %>%
str_squish()
# Create initial schedule dataframe
initial_schedule <- tibble(
team_url = team_urls,
team_name = team_names,
result = results,
game_type = game_types
)
cat(" โœ“ Found", nrow(initial_schedule), "games on your team's schedule\n")
# Step 2: Identify all teams in the league
cat("Step 2: Identifying league teams...\n")
league_teams <- initial_schedule %>%
filter(game_type == "League") %>%
mutate(team_id = str_extract(team_url, "(?<=teams/)[0-9]+(?=-)")) %>%
pull(team_id) %>%
unique()
cat(" โœ“ Found", length(league_teams), "initial league opponents\n")
# Expand league team list by checking each league team's opponents
teams_list <- vector('list')
for (team_id in league_teams) {
team_url <- paste0(TEAM_BASE_URL, team_id, "/games")
team_table <- extract_si_table(team_url)
if (is.null(team_table)) next
opponent_urls <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_attr(name = "href")
opponent_game_types <- team_table %>%
html_elements(xpath = "//table//td[5]") %>%
html_text() %>%
str_squish()
# Extract league opponents only
league_opponent_urls <- opponent_urls[opponent_game_types == "League"]
league_opponent_ids <- str_extract(league_opponent_urls, "(?<=teams/)[0-9]+(?=-)")
teams_list[[as.character(team_id)]] <- league_opponent_ids
}
# Combine all league teams
teams_list <- unlist(teams_list)
teams_league <- unique(c(YOUR_TEAM_ID, league_teams, teams_list))
# alternatively, pull league teams from standings page
league_url<- paste0(TEAM_BASE_URL, team_id, "/standings")
league_table <- extract_si_table(league_url,class_selector = "StandingsTable")
team_urls <- league_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_attr(name = "href")
team_ids <- str_extract(team_urls, "(?<=teams/)[0-9]+(?=-)")
if (LEAGUE_SOURCE=="standings") {
teams_league<-team_ids
}
cat(" โœ“ Final league size:", length(teams_league), "teams\n")
# Step 3: Build competitive network by iteratively finding opponents
cat("Step 3: Building competitive network...\n")
# Start with teams your team has played + your own team
network_team_urls <- initial_schedule %>%
filter(result %in% c("W", "L")) %>%
pull(team_url)
# Add your own team to the network
network_team_urls <- unique(c(paste0("/high-school/stats/california/volleyball/teams/", YOUR_TEAM_ID),
network_team_urls))
# Iteratively expand the network
for (iteration in 1:MAX_ITERATIONS) {
cat(" Network expansion iteration", iteration, "- Current size:", length(network_team_urls), "\n")
new_urls <- c()
for (team_url in network_team_urls) {
cat(" Processing:", team_url, "\n")
flush.console()
full_url <- paste0(BASE_URL, team_url, "/games")
team_table <- extract_si_table(full_url)
if (is.null(team_table)) next
opponent_urls <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_attr(name = "href")
new_urls <- c(new_urls, opponent_urls)
}
network_team_urls <- unique(c(network_team_urls, new_urls))
# Stop if network becomes too large
if (length(network_team_urls) > MAX_NETWORK_TEAMS) {
cat(" โš ๏ธ Network size limit (", MAX_NETWORK_TEAMS, ") reached. Stopping expansion.\n")
break
}
}
# Extract team IDs from network
network_team_ids <- str_extract(network_team_urls, "(?<=teams/)[0-9]+(?=-)")
network_team_ids <- unique(c(YOUR_TEAM_ID, network_team_ids))
# Remove any NA values that might have been extracted
network_team_ids <- network_team_ids[!is.na(network_team_ids)]
cat(" โœ“ Final competitive network:", length(network_team_ids), "teams\n")
# Step 4: Collect all games for teams in the network
cat("Step 4: Collecting game data for all network teams...\n")
all_games <- list()
for (team_id in network_team_ids) {
# Skip NA team IDs
if (is.na(team_id)) {
cat(" โš ๏ธ Skipping invalid team ID (NA)\n")
next
}
cat(" Processing team ID:", team_id, "\n")
flush.console()
team_url <- paste0(TEAM_BASE_URL, team_id, "/games")
team_table <- extract_si_table(team_url)
if (is.null(team_table)) {
cat(" โš ๏ธ Could not retrieve data for team", team_id, "after", RETRY_ATTEMPTS, "attempts\n")
next
}
# Extract game details
opponent_urls <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_attr(name = "href")
opponent_ids <- str_extract(opponent_urls, "(?<=teams/)[0-9]+(?=-)")
opponent_names <- team_table %>%
html_elements(xpath = "//table//td[2]//a") %>%
html_text()
game_types <- team_table %>%
html_elements(xpath = "//table//td[5]") %>%
html_text() %>%
str_squish()
set_scores <- team_table %>%
html_elements(xpath = "//table//td[3]//a") %>%
html_text() %>%
str_squish()
game_dates <- team_table %>%
html_elements(xpath = "//table//td[1]/span") %>%
html_text() %>%
str_squish()
# Create games dataframe for this team
team_games <- tibble(
game_date = game_dates,
team_id = team_id,
opp_team_id = opponent_ids,
opp_team_name = opponent_names,
game_type = game_types,
set_score = set_scores
)
# Parse set scores (format: "W 3-1" or "L 1-3")
team_games <- team_games %>%
separate(set_score, into = c("result", "sets_won", "opp_sets_won"), sep = "[- ]")
all_games[[team_id]] <- team_games
}
cat(" โœ“ Successfully collected data from", length(all_games), "teams\n")
# Step 5: Prepare data for SRS calculation
cat("Step 5: Preparing data for Simple Rating System calculation...\n")
games_tibble <- bind_rows(all_games) %>%
# Filter for valid completed games (volleyball is best of 5, first to 3 wins)
filter(!is.na(sets_won), !is.na(opp_sets_won)) %>%
filter((sets_won == 3 | opp_sets_won == 3) | !ONLY_BEST_OF_5) %>%
filter(team_id %in% network_team_ids, opp_team_id %in% network_team_ids) %>%
# Convert to numeric and calculate set margin
mutate(
sets_won = as.numeric(as.character(sets_won)),
opp_sets_won = as.numeric(as.character(opp_sets_won)),
set_margin = sets_won - opp_sets_won
)
cat(" โœ“ Processed", nrow(games_tibble), "valid games\n")
# Create team name mapping
team_name_map <- games_tibble %>%
group_by(opp_team_id, opp_team_name) %>%
summarise(games_count = n(), .groups = "drop") %>%
arrange(opp_team_id, desc(games_count)) %>%
group_by(opp_team_id) %>%
slice_head(n = 1) %>%
select(team_id = opp_team_id, team_name = opp_team_name)
# Step 6: Calculate Simple Rating System (SRS) rankings
cat("Step 6: Calculating SRS rankings...\n")
# Prepare SRS data (each game from both perspectives)
srs <- games_tibble %>%
select(team_id, opp_team_id, set_margin) %>%
distinct()
# Add reverse games (opponent's perspective)
srs_reverse <- srs %>%
rename(team_id = opp_team_id, opp_team_id = team_id) %>%
mutate(set_margin = -set_margin)
srs <- bind_rows(srs, srs_reverse)
# Convert to factors for model matrix creation
factor_levels <- unique(c(srs$team_id, srs$opp_team_id))
srs <- srs %>%
mutate(
team_id = factor(team_id),
opp_team_id = factor(opp_team_id)
)
# Create model matrices for regression
mm1 <- model.matrix(~team_id - 1, data = srs)
mm2 <- model.matrix(~opp_team_id - 1, data = srs)
mm <- mm1 - mm2
y <- srs$set_margin
# Run SRS regression
srs_lm <- lm(y ~ mm - 1)
coef <- srs_lm$coefficients
# Create rankings dataframe
srs_ranking <- tibble(
team_id = names(coef),
srs = coef
) %>%
mutate(team_id = str_remove(team_id, "mmteam_id")) %>%
left_join(team_name_map, by = "team_id")
cat(" โœ“ Calculated SRS ratings for", nrow(srs_ranking), "teams\n")
# Step 7: Generate league rankings
cat("Step 7: Generating final league rankings...\n")
league_ranking <- srs_ranking %>%
filter(team_id %in% teams_league) %>%
mutate(srs = ifelse(is.na(srs), 0, srs)) %>%
# Normalize ratings (mean = 0)
mutate(srs = srs - mean(srs)) %>%
arrange(desc(srs)) %>%
mutate(srs_rank = rank(-srs))
cat(" โœ“ Generated rankings for", nrow(league_ranking), "league teams\n")
# Step 8: Create and display results
cat("Step 8: Creating results table...\n")
# Highlight your team
league_ranking <- league_ranking %>%
mutate(
is_your_team = team_id == YOUR_TEAM_ID,
srs_formatted = round(srs, 2)
)
# Create beautiful table with gt (if available)
if (requireNamespace("gt", quietly = TRUE)) {
rankings_table <- league_ranking %>%
select(srs_rank, team_name, srs_formatted, is_your_team) %>%
gt() %>%
cols_hide(columns = is_your_team) %>%
cols_label(
srs_rank = "Rank",
team_name = "Team",
srs_formatted = "SRS Rating"
) %>%
tab_header(
title = "League Volleyball Rankings",
subtitle = "Simple Rating System (SRS) based on set margin performance and strength of schedule"
) %>%
tab_style(
style = list(
cell_fill(color = "lightblue"),
cell_text(weight = "bold")
),
locations = cells_body(
rows = is_your_team == TRUE
)
) %>%
tab_style(
style = cell_text(align = "center"),
locations = cells_body(columns = c(srs_rank, srs_formatted))
) %>%
tab_style(
style = cell_text(align = "center"),
locations = cells_column_labels(columns = c(srs_rank, srs_formatted))
) %>%
tab_style(
style = cell_text(align = "left"),
locations = cells_column_labels(columns = team_name)
) %>%
tab_footnote(
footnote = "Higher SRS rating indicates stronger performance. Rating normalized to league average of 0.",
locations = cells_column_labels(columns = srs_formatted)
) %>%
tab_footnote(
footnote = "Your team is highlighted in blue.",
locations = cells_title()
) %>%
tab_options(
table.font.size = 12,
heading.title.font.size = 16,
heading.subtitle.font.size = 12
)
# Display the formatted table
print(rankings_table)
} else {
# Fallback to simple table if gt is not available
cat("\n๐Ÿ“Š LEAGUE RANKINGS (Simple Rating System)\n")
cat("==========================================\n")
for (i in 1:nrow(league_ranking)) {
row <- league_ranking[i, ]
marker <- if (row$is_your_team) "๐Ÿ‘‰ " else " "
cat(sprintf("%s%2d. %-30s %+6.2f\n",
marker, row$srs_rank, row$team_name, row$srs_formatted))
}
cat("\nNote: Higher SRS rating = stronger team performance\n")
cat("Rating is normalized so league average = 0.00\n")
}
# Step 9: Summary and optional file output
cat("\n๐ŸŽฏ ANALYSIS SUMMARY\n")
cat("==================\n")
cat("Your team ID:", YOUR_TEAM_ID, "\n")
your_team_data <- league_ranking[league_ranking$is_your_team, ]
if (nrow(your_team_data) > 0) {
cat("Your team rank:", your_team_data$srs_rank, "out of", nrow(league_ranking), "\n")
cat("Your team SRS rating:", sprintf("%+.2f", your_team_data$srs_formatted), "\n")
} else {
cat("โš ๏ธ Your team was not found in the league rankings.\n")
}
cat("Total teams in network:", nrow(srs_ranking), "\n")
cat("League teams ranked:", nrow(league_ranking), "\n")
# Optional: Save results to CSV
output_file <- "volleyball_league_rankings.csv"
write.csv(league_ranking %>% select(-is_your_team), output_file, row.names = FALSE)
cat("๐Ÿ’พ Results saved to:", output_file, "\n")
cat("\nโœ… Analysis complete!\n")
# =============================================================================
# ABOUT THE SIMPLE RATING SYSTEM (SRS)
# =============================================================================
#
# The Simple Rating System calculates team strength based on:
# 1. Margin of Victory: How many sets you win/lose by in each game
# 2. Strength of Schedule: The quality of opponents you face
# 3. Network Effect: Performance against common opponents
#
# The system works by solving a system of linear equations where each team's
# rating equals their average margin of victory plus the average rating of
# their opponents. This creates ratings that account for both performance
# and strength of schedule.
#
# Ratings are normalized so the league average equals 0. Positive ratings
# indicate above-average performance, while negative ratings indicate
# below-average performance.
#
#
# =============================================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment