Created
September 28, 2025 00:26
-
-
Save boooeee/38d14a8d02589790656a12595d10b3a5 to your computer and use it in GitHub Desktop.
high school volleyball simple ranking system
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Volleyball Team Simple Rating System (SRS) | |
| # | |
| # This script scrapes high school volleyball game data from SI.com and calculates | |
| # team rankings using a Simple Rating System methodology based on set margins and strength of schedule. | |
| # | |
| # Author: Mike Beuoy | |
| # GitHub: https://gist.github.com/boooeee | |
| # | |
| # SETUP INSTRUCTIONS: | |
| # 1. Find your team's SI.com ID by visiting https://www.si.com/high-school/stats/, search for your team, navigate to "Teams", and then select the desired volleyball team. The team id will be part of the url. | |
| # 2. Update the YOUR_TEAM_ID variable below with your team's ID | |
| # 3. Install required packages if not already installed: | |
| # install.packages(c("dplyr", "rvest", "tidyr", "stringr", "gt")) | |
| # 4. Run the script! | |
| # Clear environment | |
| rm(list = ls(all = TRUE)) | |
| # Load required packages | |
| suppressPackageStartupMessages({ | |
| library(dplyr) # Data manipulation | |
| library(rvest) # Web scraping | |
| library(tidyr) # Data tidying | |
| library(stringr) # String manipulation | |
| library(gt) # Table formatting | |
| }) | |
| # ============================================================================= | |
| # CONFIGURATION - MODIFY THESE VALUES FOR YOUR TEAM | |
| # ============================================================================= | |
| # Your team's SI.com ID (find this in your team's SI.com URL) | |
| # Example: https://www.si.com/high-school/stats/california/volleyball/teams/448026/games | |
| # The team ID would be 448026 | |
| YOUR_TEAM_ID <- 448022 # <-- CHANGE THIS TO YOUR TEAM'S ID | |
| # Analysis parameters (usually don't need to change these) | |
| MAX_NETWORK_TEAMS <- 50 # Maximum teams to include in competitive network | |
| MAX_ITERATIONS <- 5 # Maximum iterations for network expansion | |
| RETRY_ATTEMPTS <- 5 # Number of retry attempts for failed web requests | |
| RETRY_DELAY <- 5 # Seconds to wait between retry attempts | |
| LEAGUE_SOURCE <- "standings" # enter "derived" to derive league teams from schedule, enter "standings" to use the league teams indicated in the standings page | |
| ONLY_BEST_OF_5 <- TRUE # Set to TRUE to only include games where winning team won 3 sets | |
| # ============================================================================= | |
| # HELPER FUNCTIONS | |
| # ============================================================================= | |
| #' Safe web scraping function with retry logic | |
| #' @param url URL to scrape | |
| #' @param max_retries Maximum number of retry attempts | |
| #' @param sleep_time Seconds to wait between retries | |
| #' @return HTML content or NULL if all attempts fail | |
| safe_pull <- function(url, max_retries = RETRY_ATTEMPTS, sleep_time = RETRY_DELAY) { | |
| for (i in 1:max_retries) { | |
| result <- tryCatch( | |
| read_html(url), | |
| error = function(e) { | |
| if (i == max_retries) { | |
| warning(paste("Failed to retrieve:", url, "after", max_retries, "attempts")) | |
| } | |
| return(NULL) | |
| } | |
| ) | |
| if (!is.null(result)) { | |
| return(result) | |
| } | |
| if (i < max_retries) { | |
| cat(" Retry", i, "failed, waiting", sleep_time, "seconds...\n") | |
| Sys.sleep(sleep_time) | |
| } | |
| } | |
| return(NULL) | |
| } | |
| #' Extract table from SI.com page with retry logic | |
| #' @param url URL of the SI.com team games page | |
| #' @param max_retries Maximum number of retry attempts | |
| #' @param sleep_time Seconds to wait between retries | |
| #' @return HTML table or NULL if extraction fails | |
| extract_si_table <- function(url, max_retries = RETRY_ATTEMPTS, sleep_time = RETRY_DELAY, class_selector = "Games-module") { | |
| for (attempt in 1:max_retries) { | |
| raw_html <- tryCatch( | |
| readChar(url, nchars = 1e8), | |
| error = function(e) NULL | |
| ) | |
| if (!is.null(raw_html)) { | |
| # Look for table with specified module in the class | |
| games_module_pattern <- paste0('<table[^>]*class="[^"]*',class_selector,'[^"]*"[^>]*>') | |
| games_module_match <- str_locate(raw_html, games_module_pattern) | |
| if (!is.na(games_module_match[1])) { | |
| # Find the start of the <table> tag for this Games-module table | |
| table_start <- str_locate(raw_html, "<table")[1] | |
| table_end <- str_locate(raw_html, "</table>")[2] | |
| } else { | |
| # Fallback to generic table search | |
| table_start <- str_locate(raw_html, "<table")[1] | |
| table_end <- str_locate(raw_html, "</table>")[2] | |
| } | |
| if (!is.na(table_start) && !is.na(table_end)) { | |
| result <- tryCatch( | |
| read_html(substr(raw_html, table_start, table_end)), | |
| error = function(e) NULL | |
| ) | |
| if (!is.null(result)) { | |
| return(result) | |
| } | |
| } | |
| } | |
| # If we get here, the attempt failed | |
| if (attempt < max_retries) { | |
| cat(" Retry", attempt, "of", max_retries, "failed, waiting", sleep_time, "seconds...\n") | |
| Sys.sleep(sleep_time) | |
| } else { | |
| warning(paste("Failed to extract table from:", url, "after", max_retries, "attempts")) | |
| } | |
| } | |
| return(NULL) | |
| } | |
| # ============================================================================= | |
| # MAIN ANALYSIS | |
| # ============================================================================= | |
| cat("๐ Volleyball Simple Rating System Analysis\n") | |
| cat("==========================================\n\n") | |
| # Constants | |
| BASE_URL <- "https://www.si.com/" | |
| TEAM_BASE_URL <- "https://www.si.com/high-school/stats/california/volleyball/teams/" | |
| # Step 1: Get initial team schedule and identify league teams | |
| cat("Step 1: Analyzing your team's schedule...\n") | |
| your_team_url <- paste0(TEAM_BASE_URL, YOUR_TEAM_ID, "/games") | |
| team_table <- extract_si_table(your_team_url) | |
| if (is.null(team_table)) { | |
| stop("โ Could not retrieve data for your team. Please check:\n", | |
| " 1. Your team ID is correct\n", | |
| " 2. Your internet connection is working\n", | |
| " 3. SI.com is accessible") | |
| } | |
| # Extract game information from your team's schedule | |
| team_urls <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_attr(name = "href") | |
| team_names <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_text() | |
| results <- team_table %>% | |
| html_elements(xpath = "//table//td[3]//a/span") %>% | |
| html_text() %>% | |
| str_squish() | |
| game_types <- team_table %>% | |
| html_elements(xpath = "//table//td[5]") %>% | |
| html_text() %>% | |
| str_squish() | |
| # Create initial schedule dataframe | |
| initial_schedule <- tibble( | |
| team_url = team_urls, | |
| team_name = team_names, | |
| result = results, | |
| game_type = game_types | |
| ) | |
| cat(" โ Found", nrow(initial_schedule), "games on your team's schedule\n") | |
| # Step 2: Identify all teams in the league | |
| cat("Step 2: Identifying league teams...\n") | |
| league_teams <- initial_schedule %>% | |
| filter(game_type == "League") %>% | |
| mutate(team_id = str_extract(team_url, "(?<=teams/)[0-9]+(?=-)")) %>% | |
| pull(team_id) %>% | |
| unique() | |
| cat(" โ Found", length(league_teams), "initial league opponents\n") | |
| # Expand league team list by checking each league team's opponents | |
| teams_list <- vector('list') | |
| for (team_id in league_teams) { | |
| team_url <- paste0(TEAM_BASE_URL, team_id, "/games") | |
| team_table <- extract_si_table(team_url) | |
| if (is.null(team_table)) next | |
| opponent_urls <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_attr(name = "href") | |
| opponent_game_types <- team_table %>% | |
| html_elements(xpath = "//table//td[5]") %>% | |
| html_text() %>% | |
| str_squish() | |
| # Extract league opponents only | |
| league_opponent_urls <- opponent_urls[opponent_game_types == "League"] | |
| league_opponent_ids <- str_extract(league_opponent_urls, "(?<=teams/)[0-9]+(?=-)") | |
| teams_list[[as.character(team_id)]] <- league_opponent_ids | |
| } | |
| # Combine all league teams | |
| teams_list <- unlist(teams_list) | |
| teams_league <- unique(c(YOUR_TEAM_ID, league_teams, teams_list)) | |
| # alternatively, pull league teams from standings page | |
| league_url<- paste0(TEAM_BASE_URL, team_id, "/standings") | |
| league_table <- extract_si_table(league_url,class_selector = "StandingsTable") | |
| team_urls <- league_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_attr(name = "href") | |
| team_ids <- str_extract(team_urls, "(?<=teams/)[0-9]+(?=-)") | |
| if (LEAGUE_SOURCE=="standings") { | |
| teams_league<-team_ids | |
| } | |
| cat(" โ Final league size:", length(teams_league), "teams\n") | |
| # Step 3: Build competitive network by iteratively finding opponents | |
| cat("Step 3: Building competitive network...\n") | |
| # Start with teams your team has played + your own team | |
| network_team_urls <- initial_schedule %>% | |
| filter(result %in% c("W", "L")) %>% | |
| pull(team_url) | |
| # Add your own team to the network | |
| network_team_urls <- unique(c(paste0("/high-school/stats/california/volleyball/teams/", YOUR_TEAM_ID), | |
| network_team_urls)) | |
| # Iteratively expand the network | |
| for (iteration in 1:MAX_ITERATIONS) { | |
| cat(" Network expansion iteration", iteration, "- Current size:", length(network_team_urls), "\n") | |
| new_urls <- c() | |
| for (team_url in network_team_urls) { | |
| cat(" Processing:", team_url, "\n") | |
| flush.console() | |
| full_url <- paste0(BASE_URL, team_url, "/games") | |
| team_table <- extract_si_table(full_url) | |
| if (is.null(team_table)) next | |
| opponent_urls <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_attr(name = "href") | |
| new_urls <- c(new_urls, opponent_urls) | |
| } | |
| network_team_urls <- unique(c(network_team_urls, new_urls)) | |
| # Stop if network becomes too large | |
| if (length(network_team_urls) > MAX_NETWORK_TEAMS) { | |
| cat(" โ ๏ธ Network size limit (", MAX_NETWORK_TEAMS, ") reached. Stopping expansion.\n") | |
| break | |
| } | |
| } | |
| # Extract team IDs from network | |
| network_team_ids <- str_extract(network_team_urls, "(?<=teams/)[0-9]+(?=-)") | |
| network_team_ids <- unique(c(YOUR_TEAM_ID, network_team_ids)) | |
| # Remove any NA values that might have been extracted | |
| network_team_ids <- network_team_ids[!is.na(network_team_ids)] | |
| cat(" โ Final competitive network:", length(network_team_ids), "teams\n") | |
| # Step 4: Collect all games for teams in the network | |
| cat("Step 4: Collecting game data for all network teams...\n") | |
| all_games <- list() | |
| for (team_id in network_team_ids) { | |
| # Skip NA team IDs | |
| if (is.na(team_id)) { | |
| cat(" โ ๏ธ Skipping invalid team ID (NA)\n") | |
| next | |
| } | |
| cat(" Processing team ID:", team_id, "\n") | |
| flush.console() | |
| team_url <- paste0(TEAM_BASE_URL, team_id, "/games") | |
| team_table <- extract_si_table(team_url) | |
| if (is.null(team_table)) { | |
| cat(" โ ๏ธ Could not retrieve data for team", team_id, "after", RETRY_ATTEMPTS, "attempts\n") | |
| next | |
| } | |
| # Extract game details | |
| opponent_urls <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_attr(name = "href") | |
| opponent_ids <- str_extract(opponent_urls, "(?<=teams/)[0-9]+(?=-)") | |
| opponent_names <- team_table %>% | |
| html_elements(xpath = "//table//td[2]//a") %>% | |
| html_text() | |
| game_types <- team_table %>% | |
| html_elements(xpath = "//table//td[5]") %>% | |
| html_text() %>% | |
| str_squish() | |
| set_scores <- team_table %>% | |
| html_elements(xpath = "//table//td[3]//a") %>% | |
| html_text() %>% | |
| str_squish() | |
| game_dates <- team_table %>% | |
| html_elements(xpath = "//table//td[1]/span") %>% | |
| html_text() %>% | |
| str_squish() | |
| # Create games dataframe for this team | |
| team_games <- tibble( | |
| game_date = game_dates, | |
| team_id = team_id, | |
| opp_team_id = opponent_ids, | |
| opp_team_name = opponent_names, | |
| game_type = game_types, | |
| set_score = set_scores | |
| ) | |
| # Parse set scores (format: "W 3-1" or "L 1-3") | |
| team_games <- team_games %>% | |
| separate(set_score, into = c("result", "sets_won", "opp_sets_won"), sep = "[- ]") | |
| all_games[[team_id]] <- team_games | |
| } | |
| cat(" โ Successfully collected data from", length(all_games), "teams\n") | |
| # Step 5: Prepare data for SRS calculation | |
| cat("Step 5: Preparing data for Simple Rating System calculation...\n") | |
| games_tibble <- bind_rows(all_games) %>% | |
| # Filter for valid completed games (volleyball is best of 5, first to 3 wins) | |
| filter(!is.na(sets_won), !is.na(opp_sets_won)) %>% | |
| filter((sets_won == 3 | opp_sets_won == 3) | !ONLY_BEST_OF_5) %>% | |
| filter(team_id %in% network_team_ids, opp_team_id %in% network_team_ids) %>% | |
| # Convert to numeric and calculate set margin | |
| mutate( | |
| sets_won = as.numeric(as.character(sets_won)), | |
| opp_sets_won = as.numeric(as.character(opp_sets_won)), | |
| set_margin = sets_won - opp_sets_won | |
| ) | |
| cat(" โ Processed", nrow(games_tibble), "valid games\n") | |
| # Create team name mapping | |
| team_name_map <- games_tibble %>% | |
| group_by(opp_team_id, opp_team_name) %>% | |
| summarise(games_count = n(), .groups = "drop") %>% | |
| arrange(opp_team_id, desc(games_count)) %>% | |
| group_by(opp_team_id) %>% | |
| slice_head(n = 1) %>% | |
| select(team_id = opp_team_id, team_name = opp_team_name) | |
| # Step 6: Calculate Simple Rating System (SRS) rankings | |
| cat("Step 6: Calculating SRS rankings...\n") | |
| # Prepare SRS data (each game from both perspectives) | |
| srs <- games_tibble %>% | |
| select(team_id, opp_team_id, set_margin) %>% | |
| distinct() | |
| # Add reverse games (opponent's perspective) | |
| srs_reverse <- srs %>% | |
| rename(team_id = opp_team_id, opp_team_id = team_id) %>% | |
| mutate(set_margin = -set_margin) | |
| srs <- bind_rows(srs, srs_reverse) | |
| # Convert to factors for model matrix creation | |
| factor_levels <- unique(c(srs$team_id, srs$opp_team_id)) | |
| srs <- srs %>% | |
| mutate( | |
| team_id = factor(team_id), | |
| opp_team_id = factor(opp_team_id) | |
| ) | |
| # Create model matrices for regression | |
| mm1 <- model.matrix(~team_id - 1, data = srs) | |
| mm2 <- model.matrix(~opp_team_id - 1, data = srs) | |
| mm <- mm1 - mm2 | |
| y <- srs$set_margin | |
| # Run SRS regression | |
| srs_lm <- lm(y ~ mm - 1) | |
| coef <- srs_lm$coefficients | |
| # Create rankings dataframe | |
| srs_ranking <- tibble( | |
| team_id = names(coef), | |
| srs = coef | |
| ) %>% | |
| mutate(team_id = str_remove(team_id, "mmteam_id")) %>% | |
| left_join(team_name_map, by = "team_id") | |
| cat(" โ Calculated SRS ratings for", nrow(srs_ranking), "teams\n") | |
| # Step 7: Generate league rankings | |
| cat("Step 7: Generating final league rankings...\n") | |
| league_ranking <- srs_ranking %>% | |
| filter(team_id %in% teams_league) %>% | |
| mutate(srs = ifelse(is.na(srs), 0, srs)) %>% | |
| # Normalize ratings (mean = 0) | |
| mutate(srs = srs - mean(srs)) %>% | |
| arrange(desc(srs)) %>% | |
| mutate(srs_rank = rank(-srs)) | |
| cat(" โ Generated rankings for", nrow(league_ranking), "league teams\n") | |
| # Step 8: Create and display results | |
| cat("Step 8: Creating results table...\n") | |
| # Highlight your team | |
| league_ranking <- league_ranking %>% | |
| mutate( | |
| is_your_team = team_id == YOUR_TEAM_ID, | |
| srs_formatted = round(srs, 2) | |
| ) | |
| # Create beautiful table with gt (if available) | |
| if (requireNamespace("gt", quietly = TRUE)) { | |
| rankings_table <- league_ranking %>% | |
| select(srs_rank, team_name, srs_formatted, is_your_team) %>% | |
| gt() %>% | |
| cols_hide(columns = is_your_team) %>% | |
| cols_label( | |
| srs_rank = "Rank", | |
| team_name = "Team", | |
| srs_formatted = "SRS Rating" | |
| ) %>% | |
| tab_header( | |
| title = "League Volleyball Rankings", | |
| subtitle = "Simple Rating System (SRS) based on set margin performance and strength of schedule" | |
| ) %>% | |
| tab_style( | |
| style = list( | |
| cell_fill(color = "lightblue"), | |
| cell_text(weight = "bold") | |
| ), | |
| locations = cells_body( | |
| rows = is_your_team == TRUE | |
| ) | |
| ) %>% | |
| tab_style( | |
| style = cell_text(align = "center"), | |
| locations = cells_body(columns = c(srs_rank, srs_formatted)) | |
| ) %>% | |
| tab_style( | |
| style = cell_text(align = "center"), | |
| locations = cells_column_labels(columns = c(srs_rank, srs_formatted)) | |
| ) %>% | |
| tab_style( | |
| style = cell_text(align = "left"), | |
| locations = cells_column_labels(columns = team_name) | |
| ) %>% | |
| tab_footnote( | |
| footnote = "Higher SRS rating indicates stronger performance. Rating normalized to league average of 0.", | |
| locations = cells_column_labels(columns = srs_formatted) | |
| ) %>% | |
| tab_footnote( | |
| footnote = "Your team is highlighted in blue.", | |
| locations = cells_title() | |
| ) %>% | |
| tab_options( | |
| table.font.size = 12, | |
| heading.title.font.size = 16, | |
| heading.subtitle.font.size = 12 | |
| ) | |
| # Display the formatted table | |
| print(rankings_table) | |
| } else { | |
| # Fallback to simple table if gt is not available | |
| cat("\n๐ LEAGUE RANKINGS (Simple Rating System)\n") | |
| cat("==========================================\n") | |
| for (i in 1:nrow(league_ranking)) { | |
| row <- league_ranking[i, ] | |
| marker <- if (row$is_your_team) "๐ " else " " | |
| cat(sprintf("%s%2d. %-30s %+6.2f\n", | |
| marker, row$srs_rank, row$team_name, row$srs_formatted)) | |
| } | |
| cat("\nNote: Higher SRS rating = stronger team performance\n") | |
| cat("Rating is normalized so league average = 0.00\n") | |
| } | |
| # Step 9: Summary and optional file output | |
| cat("\n๐ฏ ANALYSIS SUMMARY\n") | |
| cat("==================\n") | |
| cat("Your team ID:", YOUR_TEAM_ID, "\n") | |
| your_team_data <- league_ranking[league_ranking$is_your_team, ] | |
| if (nrow(your_team_data) > 0) { | |
| cat("Your team rank:", your_team_data$srs_rank, "out of", nrow(league_ranking), "\n") | |
| cat("Your team SRS rating:", sprintf("%+.2f", your_team_data$srs_formatted), "\n") | |
| } else { | |
| cat("โ ๏ธ Your team was not found in the league rankings.\n") | |
| } | |
| cat("Total teams in network:", nrow(srs_ranking), "\n") | |
| cat("League teams ranked:", nrow(league_ranking), "\n") | |
| # Optional: Save results to CSV | |
| output_file <- "volleyball_league_rankings.csv" | |
| write.csv(league_ranking %>% select(-is_your_team), output_file, row.names = FALSE) | |
| cat("๐พ Results saved to:", output_file, "\n") | |
| cat("\nโ Analysis complete!\n") | |
| # ============================================================================= | |
| # ABOUT THE SIMPLE RATING SYSTEM (SRS) | |
| # ============================================================================= | |
| # | |
| # The Simple Rating System calculates team strength based on: | |
| # 1. Margin of Victory: How many sets you win/lose by in each game | |
| # 2. Strength of Schedule: The quality of opponents you face | |
| # 3. Network Effect: Performance against common opponents | |
| # | |
| # The system works by solving a system of linear equations where each team's | |
| # rating equals their average margin of victory plus the average rating of | |
| # their opponents. This creates ratings that account for both performance | |
| # and strength of schedule. | |
| # | |
| # Ratings are normalized so the league average equals 0. Positive ratings | |
| # indicate above-average performance, while negative ratings indicate | |
| # below-average performance. | |
| # | |
| # | |
| # ============================================================================= |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment