Created
August 24, 2020 11:51
-
-
Save bdilday/36d9ce508d1aa01d27f6044c317c60ed to your computer and use it in GitHub Desktop.
graphical mlb standings in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(ggplot2) | |
library(ggrepel) | |
library(rvest) | |
library(stringr) | |
fg_current_standings <- function() { | |
url = "https://www.fangraphs.com/depthcharts.aspx?position=Standings" | |
h = xml2::read_html(url) | |
tables = html_table(h)[9:14] | |
div_order = paste(rep(c("AL", "NL"), each=3), c("E", "C", "W"), sep='-') | |
current_standings = lapply(1:6, function(i) { | |
tmp = tables[[i]] | |
names(tmp) = tmp[2,] | |
tmp = tmp[3:nrow(tmp),1:8] # only current standings + team name, not projections | |
tmp$div_id = div_order[[i]] | |
tmp | |
}) %>% dplyr::bind_rows() | |
nms = names(current_standings) | |
nms = str_replace(nms, "%", "pct") | |
nms = str_replace(nms, "/", "_") | |
names(current_standings) = nms | |
for (nm in c("G", "W", "L", "RDif")) { | |
current_standings[[nm]] = as.integer(current_standings[[nm]]) | |
} | |
for (nm in c("Wpct", "RS_G", "RA_G")) { | |
current_standings[[nm]] = as.numeric(current_standings[[nm]]) | |
} | |
current_standings | |
} | |
del_run = 2.75 | |
current_standings = fg_current_standings() | |
rs1 = 0.5 * (mean(current_standings$RS_G) + mean(current_standings$RA_G)) | |
run_max = rs1 + del_run | |
run_min = rs1 - del_run | |
text_x_min = run_min + 2 * del_run * 0.12 | |
text_x_max = run_max - 2 * del_run * 0.12 | |
# 36 to 126 per 162 games in steps of 10 | |
wseq = seq(81-45, 81+45, 10) / 162 | |
slopes = sqrt((1-wseq)/wseq) | |
dfC = data.frame(s=slopes) | |
dfC$x = run_min | |
dfC$y = run_min * slopes | |
dfC$xend = run_max | |
dfC$yend = run_max * slopes | |
cc = which(dfC$y < run_min) | |
dfC[cc,]$y = run_min | |
dfC[cc,]$x = run_min / slopes[cc] | |
cc = which(dfC$yend > run_max) | |
dfC[cc,]$yend = run_max | |
dfC[cc,]$xend = run_max / slopes[cc] | |
wlabs = data.frame(w=wseq*162) | |
wlabs$x = text_x_min | |
wlabs$y = text_x_min * slopes | |
cc = which(wlabs$y < run_min) | |
if (length(cc) > 0) { | |
wlabs[cc,]$x = text_x_max | |
wlabs[cc,]$y = text_x_max * slopes[cc] | |
} | |
p = current_standings %>% | |
ggplot(aes(x=RS_G, y=RA_G)) + geom_point() + | |
geom_text_repel(aes(label=Team)) + | |
theme_minimal(base_size = 16) + | |
xlim(run_min-0.01, run_max+0.01) + ylim(run_max+0.01, run_min-0.01) + | |
labs(x="RS / G", y="RA / G") + | |
geom_segment(data=dfC, aes(x=x, y=y, xend=xend, yend=yend), alpha=0.5) + | |
theme(panel.grid = element_blank()) + | |
geom_vline(xintercept=rs1) + geom_hline(yintercept=rs1) + | |
facet_wrap(~div_id) + geom_text(data=wlabs, aes(x=x, y=y-0.1, label=w), alpha=0.5) | |
# compute the end points for the curve | |
current_standings = current_standings %>% | |
mutate(wpythag = RS_G**2/(RS_G**2 + RA_G**2), | |
x0 = RS_G * sqrt(Wpct/wpythag), | |
y0 = x0 * sqrt((1-Wpct)/Wpct)) | |
# this function generates 100 points along a curve given by the parameters | |
parametric_curve = function(x0, y0, xend, yend) { | |
dx_seq = seq(0, 1, 0.01) | |
lapply(dx_seq, function(dx) { | |
tx = x0 + (xend-x0)*dx | |
ty = sqrt(x0**2 + y0**2 - tx**2) | |
list(x=tx, y=ty) | |
}) %>% bind_rows() | |
} | |
# generate the curves for each team | |
arcs = lapply(1:nrow(current_standings), function(idx) { | |
row = current_standings[idx,] | |
arc = parametric_curve(row$RS_G, row$RA_G, row$x0, row$y0) | |
arc$Team = row$Team | |
arc$div_id = row$div_id | |
arc$lucky = as.integer(row$x0 > row$RS_G) | |
arc | |
}) %>% bind_rows() | |
p = current_standings %>% | |
ggplot(aes(x=RS_G, y=RA_G)) + geom_point() + | |
geom_text_repel(aes(x=x0, y=y0, label=Team, color=as.factor(Wpct > wpythag))) + | |
theme_minimal(base_size = 16) + | |
xlim(run_min-0.01, run_max+0.01) + ylim(run_max+0.01, run_min-0.01) + | |
labs(x="RS / G", y="RA / G", | |
title=sprintf("Graphical MLB standings: %s", Sys.Date())) + | |
geom_segment(data=dfC, aes(x=x, y=y, xend=xend, yend=yend), alpha=0.5) + | |
theme(panel.grid = element_blank()) + | |
geom_vline(xintercept=rs1) + geom_hline(yintercept=rs1) + | |
facet_wrap(~div_id) + geom_text(data=wlabs, | |
aes(x=x, y=y-0.1, label=w), | |
alpha=0.5) + | |
geom_path(data=arcs, aes(x=x, y=y, group=Team)) + | |
scale_color_manual(values=c("steelblue", "red"), guide="none") | |
dot_plot = function() { | |
plot_df = current_standings | |
plot_df$div_id = factor(plot_df, | |
levels = c("AL-E", "AL-C","AL-W","NL-E", "NL-C","NL-W")) | |
plot_df = plot_df %>% | |
mutate(lg = ifelse(grepl('AL', div_id), 'AL', 'NL')) %>% | |
group_by(div_id) %>% | |
arrange(Wpct) %>% | |
mutate(div_rank=row_number()) %>% | |
ungroup() %>% | |
group_by(lg) %>% | |
arrange(-Wpct) %>% | |
mutate(lg_rank=row_number()) %>% ungroup() | |
p2 = plot_df %>% filter(div_rank == 5) | |
p1a = plot_df %>% filter(div_rank < 5, lg=='AL') %>% top_n(2, -lg_rank) | |
p1n = plot_df %>% filter(div_rank < 5, lg=='NL') %>% top_n(2, -lg_rank) | |
plot_df$playoff_bound = 0 | |
cc2 = which(plot_df$Team %in% p2$Team) | |
cc1 = which(plot_df$Team %in% c(p1a$Team, p1n$Team)) | |
plot_df[cc2,]$playoff_bound = 2 | |
plot_df[cc1,]$playoff_bound = 1 | |
plot_df$playoff_bound = factor(plot_df$playoff_bound, levels = c(2,1,0)) | |
ptit = sprintf("MLB Standings - %s", Sys.Date()) | |
p_by_div = plot_df %>% | |
ggplot() + | |
geom_dumbbell(aes(x=Wpct, xend=wpythag, y=div_rank), | |
size_x = 3, size_xend = 1.5, | |
colour_x = "black", colour_xend = "steelblue") + | |
geom_text(aes(x=0.2, y=div_rank, label=Team, color=playoff_bound), | |
hjust=0, size=3.5, nudge_y = 0.2,fontface="bold") + | |
theme_minimal(base_size = 16) + | |
theme(axis.text.y = element_blank()) + | |
labs(x="Win Pct.", y="", title=ptit) + xlim(0.2, 0.8) + | |
scale_x_continuous(sec.axis = sec_axis(~.*162, name = 'Wins per 162 G')) + | |
theme(panel.grid.minor.y = element_blank()) + | |
scale_color_manual(values = c("firebrick3", "royalblue3", "gray24")) + | |
guides(color=FALSE) + | |
geom_vline(xintercept = 0.5, linetype=2, color='gray55') + | |
facet_wrap(~div_id) | |
p_by_lg = plot_df %>% | |
mutate(lg_rank = -lg_rank) %>% | |
# mutate(Team = paste(str_sub(tolower(plot_df$div_id), 4), plot_df$Team)) %>% | |
ggplot() + | |
geom_dumbbell(aes(x=Wpct, xend=wpythag, y=lg_rank), | |
size_x = 3, size_xend = 1.5, | |
colour_x = "black", colour_xend = "steelblue") + | |
geom_text(aes(x=0.2, y=lg_rank, label=Team, color=playoff_bound), | |
hjust=0, size=3.5, nudge_y = 0.2,fontface="bold") + | |
theme_minimal(base_size = 16) + | |
theme(axis.text.y = element_blank()) + | |
labs(x="Win Pct.", y="", title=ptit) + xlim(0.2, 0.8) + | |
scale_x_continuous(sec.axis = sec_axis(~.*162, name = 'Wins per 162 G')) + | |
scale_color_manual(values = c("firebrick3", "royalblue3", "gray24")) + | |
guides(color=FALSE) + | |
geom_vline(xintercept = 0.5, linetype=2, color='gray55') + | |
facet_wrap(~lg) + | |
scale_y_continuous(minor_breaks = seq(-15, -1, 1)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment