library(httr)
library(tibble)
library(tidyr)
library(dplyr)
library(purrr)
library(janitor)
library(cli)
library(stringr)
library(readr)
data_dir <- '/path/to/data'
match_stats_dir <- file.path(data_dir, 'match-stats')
live_match_elements_dir <- file.path(data_dir, 'live-match-elements')
dir.create(match_stats_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(live_match_elements_dir, showWarnings = FALSE)
season_id <- 285026
Scrape match info.
matches_resp <- GET(sprintf('https://api.fifa.com/api/v3/calendar/matches?language=en&count=500&idSeason=%s', season_id))
results <- content(matches_resp) |> pluck('Results')
## for incomplete matches (anything beyond the group stage at the moment), there will be `NULL`s
## which causes `pluck()` to throw an error. using a `.default` of `NA` fixes the issue.
pluck2_chr <- partial(pluck, .default = NA_character_, ... = )
pluck2_int <- partial(pluck, .default = NA_integer_, ... = )
map_pluck_chr <- function(x, ...) {
map_chr(x, pluck2_chr, ...)
}
map_pluck_int <- function(x, ...) {
map_int(x, pluck2_int, ...)
}
matches <- tibble(
competition_id = map_pluck_chr(results, 'IdCompetition'),
season_id = map_pluck_chr(results, 'IdSeason'),
stage_id = map_pluck_chr(results, 'IdStage'),
group_id = map_pluck_chr(results, 'IdGroup'),
## this won't join with the match stats, but it seems to be Fifa's "true" match ID
match_id = map_pluck_chr(results, 'IdMatch'),
match_status = map_pluck_int(results, 'MatchStatus'),
## use this to join with the match stats
result_id = map_pluck_chr(results, 'Properties', 'IdIFES'),
home_abbr = map_pluck_chr(results, 'Home', 'Abbreviation'),
away_abbr = map_pluck_chr(results, 'Away', 'Abbreviation')
) |>
filter(match_status == 0L) |>
select(-match_status)
matches
# A tibble: 2 × 8
competition_id season_id stage_id group_id match_id result_id home_…¹ away_…²
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 103 285026 285033 285037 400222852 131872 NZL NOR
2 103 285026 285033 285036 400222851 131878 AUS IRL
# … with abbreviated variable names ¹home_abbr, ²away_abbr
Scrape stats for players in matches.
scrape_match_stats <- function(result_id) {
stats_resp <- GET(sprintf('https://fdh-api.fifa.com/v1/stats/match/%s/players.json', result_id))
stop_for_status(stats_resp)
stats_resp |>
content() |>
enframe('player_id', 'values') |>
unnest_longer(values)
}
scrape_and_save_match_stats <- function(result_id) {
path <- file.path(match_stats_dir, paste0(result_id, '.rds'))
if (file.exists(path)) {
return(read_rds(path))
}
cli_inform('Scraping {result_id}.')
res <- scrape_match_stats(result_id)
write_rds(res, path)
res
}
match_stats <- matches |>
pull(result_id) |>
map_dfr(
~{
scrape_and_save_match_stats(.x) |>
mutate(result_id = !!.x, .before = 1)
}
)
match_stats
# A tibble: 5,155 × 3
result_id player_id values
<chr> <chr> <list>
1 131872 467661 <list [3]>
2 131872 467661 <list [3]>
3 131872 467661 <list [3]>
4 131872 467661 <list [3]>
5 131872 467661 <list [3]>
6 131872 467661 <list [3]>
7 131872 467661 <list [3]>
8 131872 467661 <list [3]>
9 131872 467661 <list [3]>
10 131872 467661 <list [3]>
# … with 5,145 more rows
# ℹ Use `print(n = ...)` to see more rows
Get info for players (warning: this is obnoxious)
generate_live_match_url <- function(competition_id, season_id, stage_id, match_id) {
sprintf(
'https://api.fifa.com/api/v3/live/football/%s/%s/%s/%s?language=en',
competition_id,
season_id,
stage_id,
match_id
)
}
scrape_live_match_elements <- function(url) {
resp <- GET(url)
elements <- content(resp) |>
enframe('element', 'values')
}
scrape_and_save_live_match_elements <- function(url, result_id, overwrite = FALSE) {
path <- file.path(live_match_elements_dir, paste0(result_id, '.rds'))
if (file.exists(path) & isFALSE(overwrite)) {
return(read_rds(path))
}
cli_inform('Scraping {result_id}.')
res <- scrape_live_match_elements(url)
write_rds(res, path)
res
}
live_match_elements <- matches |>
mutate(
live_match_url = generate_live_match_url(
competition_id = competition_id,
season_id = season_id,
stage_id = stage_id,
match_id = match_id
)
) |>
pull(live_match_url, result_id) |>
imap_dfr(
~{
scrape_and_save_live_match_elements(
url = ..1,
result_id = ..2
) |>
mutate(
result_id = !!..2,
.before = 1
)
}
)
live_match_teams <- live_match_elements |>
filter(
element %in% c(
'HomeTeam',
'AwayTeam'
)
) |>
unnest_wider(values)
players <- live_match_teams |>
transmute(
country = ShortClubName,
country_picture_url = str_replace_all(PictureUrl, c('\\{format\\}' = 'sq', '\\{size\\}' = '4')),
Players
) |>
unnest_longer(Players) |>
unnest_wider(Players) |>
unnest_wider(where(is.list), names_sep = '_') |>
unnest_wider(where(is.list), names_sep = '_') |>
distinct(
player_id = IdPlayer,
player_name = PlayerName_1_Description,
player_picture_url = PlayerPicture_PictureUrl,
country,
country_picture_url
)
players
# A tibble: 92 × 5
player_id player_name player_picture_url country count…¹
<chr> <chr> <chr> <chr> <chr>
1 301468 Victoria ESSON https://digitalhub.fifa.com/trans… New Ze… https:…
2 358189 C.J. BOTT https://digitalhub.fifa.com/trans… New Ze… https:…
3 252502 Ali RILEY https://digitalhub.fifa.com/trans… New Ze… https:…
4 355896 Rebekah STOTT https://digitalhub.fifa.com/trans… New Ze… https:…
5 301461 Katie BOWEN https://digitalhub.fifa.com/trans… New Ze… https:…
6 252519 Ria PERCIVAL https://digitalhub.fifa.com/trans… New Ze… https:…
7 397010 Malia STEINMETZ https://digitalhub.fifa.com/trans… New Ze… https:…
8 298792 Betsy HASSETT https://digitalhub.fifa.com/trans… New Ze… https:…
9 395309 Jacqui HAND https://digitalhub.fifa.com/trans… New Ze… https:…
10 321169 Hannah WILKINSON https://digitalhub.fifa.com/trans… New Ze… https:…
# … with 82 more rows, and abbreviated variable name ¹country_picture_url
# ℹ Use `print(n = ...)` to see more rows
Combine everything
unnested_match_stats <- match_stats |>
hoist(
values,
'stat' = 1,
'value' = 2
) |>
select(-values)
player_stats <- unnested_match_stats |>
group_by(result_id, player_id, stat) |>
slice_max(value, n = 1, with_ties = FALSE) |>
ungroup() |>
pivot_wider(
names_from = stat,
values_from = value,
values_fill = 0
) |>
clean_names()
player_stats
Rows: 92
Columns: 89
$ result_id <chr> "131872", "131872",…
$ player_id <chr> "252502", "252503",…
$ assists <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal <dbl> 0, 0, 2, 0, 0, 0, 0…
$ attempt_at_goal_blocked <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_from_free_kicks <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_inside_the_penalty_area <dbl> 0, 0, 1, 0, 0, 0, 0…
$ attempt_at_goal_inside_the_penalty_area_on_target <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_off_target <dbl> 0, 0, 2, 0, 0, 0, 0…
$ attempt_at_goal_on_target <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_outside_the_penalty_area <dbl> 0, 0, 1, 0, 0, 0, 0…
$ attempt_at_goal_outside_the_penalty_area_on_target <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempted_ball_progressions <dbl> 2, 0, 0, 1, 0, 5, 2…
$ attempted_switches_of_play <dbl> 0, 0, 0, 0, 0, 0, 0…
$ avg_speed <dbl> 5.99, 0.00, 6.43, 5…
$ completed_ball_progressions <dbl> 2, 0, 0, 0, 0, 4, 2…
$ completed_switches_of_play <dbl> 0, 0, 0, 0, 0, 0, 0…
$ corners <dbl> 0, 0, 8, 0, 0, 0, 0…
$ crosses <dbl> 1, 0, 9, 0, 0, 2, 0…
$ crosses_completed <dbl> 0, 0, 1, 0, 0, 0, 0…
$ defensive_pressures_applied <dbl> 21, 0, 16, 10, 0, 3…
$ direct_defensive_pressures_applied <dbl> 9, 0, 8, 3, 0, 9, 0…
$ direct_free_kicks <dbl> 0, 0, 0, 0, 0, 0, 4…
$ distance_high_speed_running <dbl> 2172.3336, 0.0000, …
$ distance_high_speed_sprinting <dbl> 220.84722, 0.00000,…
$ distance_jogging <dbl> 3625.9098, 0.0000, …
$ distance_low_speed_sprinting <dbl> 507.948606, 0.00000…
$ distance_walking <dbl> 3511.4708, 0.0000, …
$ distributions_completed_under_pressure <dbl> 9, 0, 12, 4, 0, 17,…
$ distributions_under_pressure <dbl> 14, 0, 16, 8, 0, 30…
$ fouls_against <dbl> 0, 0, 1, 1, 0, 0, 0…
$ fouls_for <dbl> 0, 0, 2, 0, 0, 0, 0…
$ free_kicks <dbl> 0, 0, 0, 0, 0, 0, 4…
$ goal_kicks <dbl> 0, 0, 0, 0, 0, 0, 1…
$ goalkeeper_defensive_actions_inside_penalty_area <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goalkeeper_defensive_actions_outside_penalty_area <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goalkeeper_goal_preventions <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_conceded_from_attempt_at_goal_against <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_from_direct_free_kicks <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_inside_the_penalty_area <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_outside_the_penalty_area <dbl> 0, 0, 0, 0, 0, 0, 0…
$ headed_attempt_at_goal <dbl> 0, 0, 0, 0, 0, 0, 0…
$ indirect_free_kicks <dbl> 0, 0, 0, 0, 0, 0, 0…
$ linebreaks_attempted <dbl> 17, 0, 14, 14, 0, 1…
$ linebreaks_attempted_all_lines <dbl> 1, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_attacking_and_midfield_line <dbl> 3, 0, 1, 5, 0, 3, 1…
$ linebreaks_attempted_attacking_line <dbl> 5, 0, 2, 7, 0, 3, 1…
$ linebreaks_attempted_attacking_line_completed <dbl> 3, 0, 1, 3, 0, 1, 7…
$ linebreaks_attempted_completed <dbl> 9, 0, 10, 6, 0, 12,…
$ linebreaks_attempted_defensive_line <dbl> 1, 0, 2, 0, 0, 5, 0…
$ linebreaks_attempted_defensive_line_completed <dbl> 0, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_midfield_and_defensive_line <dbl> 1, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_midfield_line <dbl> 11, 0, 10, 7, 0, 11…
$ linebreaks_attempted_midfield_line_completed <dbl> 6, 0, 8, 3, 0, 8, 5…
$ linebreaks_attempted_under_pressure <dbl> 3, 0, 12, 2, 0, 16,…
$ linebreaks_completed_all_lines <dbl> 0, 0, 1, 0, 0, 1, 0…
$ linebreaks_completed_attacking_and_midfield_line <dbl> 3, 0, 1, 5, 0, 3, 1…
$ linebreaks_completed_midfield_and_defensive_line <dbl> 0, 0, 1, 0, 0, 1, 0…
$ linebreaks_completed_under_pressure <dbl> 1, 0, 10, 0, 0, 9, …
$ offers_to_receive_in_behind <dbl> 1, 0, 9, 0, 0, 6, 2…
$ offers_to_receive_in_between <dbl> 7, 0, 3, 0, 0, 23, …
$ offers_to_receive_in_front <dbl> 12, 0, 4, 4, 0, 2, …
$ offers_to_receive_inside <dbl> 1, 0, 9, 2, 0, 14, …
$ offers_to_receive_outside <dbl> 19, 0, 7, 2, 0, 17,…
$ offers_to_receive_total <dbl> 20, 0, 16, 4, 0, 31…
$ offsides <dbl> 0, 0, 0, 0, 0, 1, 0…
$ own_goals <dbl> 0, 0, 0, 0, 0, 0, 0…
$ passes <dbl> 37, 0, 22, 27, 0, 3…
$ passes_completed <dbl> 29, 0, 18, 21, 0, 2…
$ penalties <dbl> 0, 0, 1, 0, 0, 0, 0…
$ penalties_scored <dbl> 0, 0, 0, 0, 0, 0, 0…
$ received_offers_to_receive <dbl> 9, 0, 4, 3, 0, 9, 7…
$ receptions_between_midfield_and_defensive_line <dbl> 3, 0, 8, 1, 0, 13, …
$ receptions_under_direct_pressure <dbl> 0, 0, 3, 1, 0, 4, 0…
$ receptions_under_indirect_pressure <dbl> 6, 0, 12, 6, 0, 12,…
$ receptions_under_no_pressure <dbl> 21, 0, 12, 22, 0, 2…
$ receptions_under_pressure <dbl> 6, 0, 15, 7, 0, 16,…
$ red_cards <dbl> 0, 0, 0, 0, 0, 0, 0…
$ speed_runs <dbl> 165, 0, 183, 149, 0…
$ sprints <dbl> 41, 0, 40, 32, 0, 6…
$ substitutions_in <dbl> 0, 0, 0, 0, 0, 0, 0…
$ substitutions_out <dbl> 0, 0, 0, 0, 0, 0, 0…
$ take_ons_completed <dbl> 0, 0, 0, 0, 0, 0, 0…
$ throw_ins <dbl> 14, 0, 0, 0, 0, 0, …
$ time_played <dbl> 100, 0, 100, 100, 0…
$ top_speed <dbl> 28.77, 0.00, 27.15,…
$ total_distance <dbl> 10038.51, 0.00, 107…
$ yellow_cards <dbl> 0, 0, 0, 0, 0, 0, 0…
Bonus: per 90 stats
player_stats_p90 <- player_stats |>
group_by(player_id) |>
summarize(
n_matches = n_distinct(result_id),
across(
-c(result_id),
sum
)
) |>
ungroup() |>
mutate(
across(
-c(player_id, time_played),
list(p90 = \(.x) 90 * .x / time_played)
)
) |>
left_join(
players,
by = join_by(player_id)
)
nerdsniped by you mentioning "warning: this is obnoxious"