Last active
June 30, 2022 06:25
-
-
Save meghall06/e125bc9e99446b4a09c7598cd47cb4a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# to create the plot in this tweet: https://twitter.com/MeghanMHall/status/1518397321150664705 | |
library(tidyverse) | |
library(hockeyR) | |
library(glue) | |
library(ggtext) | |
library(showtext) | |
font_add_google(name = "Jost", family = "jost") | |
showtext_auto() | |
# initial data prep---- | |
pbp_raw <- load_pbp("2021-22") | |
pbp <- pbp_raw %>% | |
mutate(event_length = ifelse(lead(game_id) == game_id, | |
lead(game_seconds) - game_seconds, 0)) %>% | |
rename(home_team = home_abbreviation, | |
away_team = away_abbreviation) | |
# I like the NHL rosters better than the hockey-ref ones | |
game_ids <- pbp %>% | |
select(game_id) %>% | |
mutate(game_id = as.character(game_id)) %>% | |
unique() | |
roster_fn <- function(id) { | |
hockeyR::get_game_rosters(jsonlite::read_json(glue::glue("http://statsapi.web.nhl.com/api/v1/game/{id}/feed/live"))) | |
} | |
rosters <- purrr::map_df(game_ids$game_id, roster_fn) %>% | |
unique() | |
positions <- rosters %>% | |
mutate(player = str_replace_all(player_name, "'", " "), | |
player = str_replace_all(player, "\\.", " "), | |
player = str_replace_all(player, "-", " "), | |
player = ifelse(player_id == 8480222, "Sebastian AhoD", | |
player)) %>% | |
select(player, position_type) %>% | |
unique() | |
# fix the Marc Andre Fleury problem---- | |
# for some reason MAF counts as a skater, not a goalie, for the on-ice variables | |
# and those drive skater counts and those drive strength state... | |
# so CHI is never depicted with the correct strength state ¯\_(ツ)_/¯ | |
# also happens to Ukko Pekka Luukkonen and Cam Talbot | |
# noticed this pretty early on, that a few goalies were "on ice" | |
# wouldn't be a big deal, except for it affects strength state... | |
# can see that CHI only spends 38% of the time @ 5v5, which is def wrong | |
goalie_issue <- pbp %>% | |
select(home_team, away_team, home_skaters, away_skaters, event_length) %>% | |
filter(event_length > 0) %>% | |
pivot_longer(home_team:away_skaters, | |
names_to = c(NA, ".value"), | |
names_sep = 5) %>% | |
group_by(team, skaters) %>% | |
summarize(time = sum(event_length)) %>% | |
add_tally(wt = time) %>% | |
mutate(perc = round(time / n * 100, 2)) %>% | |
filter(skaters == 5) %>% | |
arrange(perc) | |
pbp <- pbp %>% | |
mutate(across(contains("_on_"), ~ ifelse(.x %in% c("Marc.Andre.Fleury", | |
"Ukko.Pekka.Luukkonen", | |
"Cam.Talbot", | |
"J.F.Berube"), NA, .x))) %>% | |
# why does this calc only work if it's in a separate mutate function... | |
mutate(home_skaters = rowSums(!is.na(select(., contains("home_on")))), | |
away_skaters = rowSums(!is.na(select(., contains("away_on")))), | |
home_v_away = str_c(home_skaters, "v", away_skaters)) | |
# players by game---- | |
# to have a record of which players played in which game | |
player_by_game <- pbp %>% | |
select(game_id, home_team, home_on_1:home_on_7, | |
away_team, away_on_1:away_on_7) %>% | |
rename_with(~str_replace(., "_on", ""), .cols = contains("on")) %>% | |
pivot_longer(home_team:away_7, | |
names_to = c("home_away", ".value"), | |
names_pattern = "(.+)_(.+)", | |
values_drop_na = TRUE) %>% | |
pivot_longer(`1`:`7`, | |
names_to = NULL, | |
values_to = "player", | |
values_drop_na = TRUE) %>% | |
unique() %>% | |
mutate(player = str_replace_all(player, "\\.", " ")) %>% | |
group_by(player, team) %>% | |
mutate(games = n_distinct(game_id)) | |
# 5v5---- | |
# trailing OZ faceoffs only in the third | |
five_v_five <- pbp %>% | |
filter(home_v_away == "5v5" & home_score != away_score & | |
event_type == "FACEOFF" & period == 3) %>% | |
mutate(trailing_OZ = case_when(home_score < away_score & x_fixed == 69 ~ 1, | |
away_score < home_score & x_fixed == -69 ~ 1, | |
TRUE ~ 0)) %>% | |
filter(trailing_OZ == 1) %>% | |
select(home_score, home_team, away_score, away_team, home_on_1:home_on_5, | |
away_on_1:away_on_5, trailing_OZ, game_id) %>% | |
mutate(trailing_team = ifelse(home_score < away_score, home_team, away_team), | |
trailing_amt = abs(home_score - away_score), | |
trailing1 = ifelse(home_score < away_score, home_on_1, away_on_1), | |
trailing2 = ifelse(home_score < away_score, home_on_2, away_on_2), | |
trailing3 = ifelse(home_score < away_score, home_on_3, away_on_3), | |
trailing4 = ifelse(home_score < away_score, home_on_4, away_on_4), | |
trailing5 = ifelse(home_score < away_score, home_on_5, away_on_5)) %>% | |
select(trailing_team:trailing5, game_id) %>% | |
group_by(trailing_team, game_id) %>% | |
mutate(no = row_number()) | |
# count of those faceoffs per team per game | |
game_sums_ozf <- five_v_five %>% | |
group_by(trailing_team, game_id) %>% | |
summarize(ozf_team = max(no)) | |
# count of those faceoffs per player per game | |
player_sums_ozf <- five_v_five %>% | |
pivot_longer(trailing1:trailing5, names_to = NULL, values_to = "player") %>% | |
count(trailing_team, player, game_id, name = "ozf") %>% | |
mutate(player = str_replace_all(player, "\\.", " "), | |
player = ifelse(player == "Sebastian Aho" & trailing_team == "NYI", | |
"Sebastian AhoD", player)) | |
# start with the original players list, add in the player sums as well as the | |
# game sums | |
ozf <- player_by_game %>% | |
left_join(player_sums_ozf, by = c("player" = "player", | |
"team" = "trailing_team", | |
"game_id" = "game_id")) %>% | |
left_join(select(positions, player, position_type), by = "player") %>% | |
left_join(game_sums_ozf, by = c("team" = "trailing_team", | |
"game_id" = "game_id")) %>% | |
mutate(ozf = replace_na(ozf, 0), | |
ozf_team = replace_na(ozf_team, 0)) %>% | |
group_by(player, position_type, team, games) %>% | |
summarize(ozf = sum(ozf), | |
ozf_team = sum(ozf_team)) %>% | |
mutate(ozf_perc = ozf / ozf_team) | |
# PK---- | |
pk <- pbp %>% | |
filter(home_v_away %in% c("4v5","5v4")) %>% | |
mutate(pp_team = ifelse(away_skaters == 4, home_team, away_team), | |
pk_team = ifelse(away_skaters == 4, away_team, home_team), | |
pk_1 = ifelse(away_skaters == 4, away_on_1, home_on_1), | |
pk_2 = ifelse(away_skaters == 4, away_on_2, home_on_2), | |
pk_3 = ifelse(away_skaters == 4, away_on_3, home_on_3), | |
pk_4 = ifelse(away_skaters == 4, away_on_4, home_on_4), | |
shot_event = case_when(!(event_type %in% c("GOAL","SHOT","BLOCKED_SHOT", | |
"MISSED_SHOT")) ~ NA_character_, | |
event_team_abbr == pp_team ~ "PP", | |
event_team_abbr == pk_team ~ "PK")) %>% | |
filter(event_length > 0 | event_type %in% c("GOAL","SHOT","BLOCKED_SHOT", | |
"MISSED_SHOT","FACEOFF")) %>% | |
select(game_id, event_type, event_idx, event_team = event_team_abbr, | |
shot_event, pp_team:pk_4, event_length) | |
# sum of PK time per team per game | |
game_sums_pk <- pk %>% | |
group_by(pk_team, game_id) %>% | |
summarize(time_team = sum(event_length)) | |
# sum of PK time per player per game | |
player_sums_pk <- pk %>% | |
pivot_longer(pk_1:pk_4, names_to = NULL, values_to = "player") %>% | |
group_by(pk_team, player, game_id) %>% | |
summarize(time = sum(event_length)) %>% | |
mutate(player = str_replace_all(player, "\\.", " "), | |
player = ifelse(player == "Sebastian Aho" & pk_team == "NYI", | |
"Sebastian AhoD", player)) | |
# start with the original players list, add in the player sums as well as the | |
# game sums | |
pk_time <- player_by_game %>% | |
left_join(player_sums_pk, by = c("player" = "player", | |
"team" = "pk_team", | |
"game_id" = "game_id")) %>% | |
left_join(select(positions, player, position_type), by = "player") %>% | |
left_join(game_sums_pk, by = c("team" = "pk_team", | |
"game_id" = "game_id")) %>% | |
mutate(time = replace_na(time, 0), | |
time_team = replace_na(time_team, 0)) %>% | |
group_by(player, position_type, team, games) %>% | |
summarize(time = sum(time), | |
time_team = sum(time_team)) %>% | |
mutate(time_perc = time / time_team) | |
# initial plots---- | |
plot <- ozf %>% | |
left_join(pk_time, by = c("player","position_type","team","games")) %>% | |
filter(games > 20 & !is.na(position_type)) | |
names <- hockeyR::team_logos_colors | |
focus_team <- plot %>% | |
filter(team %in% c("TOR","CAR","CBJ","STL")) %>% | |
mutate(label = case_when(ozf_perc > 0.4 & time_perc > 0.39 ~ player, | |
TRUE ~ NA_character_)) %>% | |
left_join(select(names, team = team_abbr, full_team_name), by = "team") | |
gray <- plot %>% | |
ungroup() %>% | |
select(-team) | |
focus_team %>% | |
ggplot(aes(x = ozf_perc, y = time_perc)) + | |
geom_point(data = gray, color = "grey", alpha = 0.5) + | |
geom_point(size = 2.5, color = "#172A3A") + | |
facet_wrap(~full_team_name) + | |
geom_text(size = 4, mapping = aes(label = label), family = "jost", | |
hjust = -0.1) + | |
scale_y_continuous(labels = scales::percent) + | |
scale_x_continuous(labels = scales::percent) + | |
labs(title = "Personnel on the penalty kill, 2021-22 NHL season through April 7", | |
y = "% of time on the PK", | |
x = "% of 5v5 high-leverage offensive faceoffs", | |
subtitle = "@MeghanMHall, data from hockeyR") + | |
guides(color = "none") + | |
theme_bw() + | |
theme(text = element_text(family = "jost"), | |
panel.background = element_blank(), | |
panel.grid.major = element_line(color = "grey90", size = 0.3), | |
strip.background = element_blank(), | |
strip.text = element_textbox(size = 11, color = "white", fill = "#172A3A", | |
halign = 0.5, linetype = 1, r = unit(5, "pt"), width = unit(1, "npc"), | |
padding = margin(3, 0, 3, 0), margin = margin(0, 2, 0, 2)), | |
panel.border = element_rect(color = "black", fill = NA, size = 0.5), | |
axis.ticks = element_blank(), | |
plot.caption.position = "plot", | |
plot.caption = element_text(hjust = 0, vjust = -1), | |
plot.subtitle = element_text(face = 3, size = 9)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment