Skip to content

Instantly share code, notes, and snippets.

@tonyelhabr
Created July 5, 2021 08:25
Show Gist options
  • Save tonyelhabr/4685f96e7fe4d48427b77544d3f84867 to your computer and use it in GitHub Desktop.
Save tonyelhabr/4685f96e7fe4d48427b77544d3f84867 to your computer and use it in GitHub Desktop.
library(tidyverse)
dir_data <- file.path('inst', 'extdata', 'json')
paths <-
fs::dir_ls(dir_data, regexp = 'json$')
paths
n_path <- paths %>% length()
n_path
# paths <- 'inst/extdata/json/8-2020-1485311.json'
# paths <- 'inst/extdata/json/8-2019-1375927.json'
# Reference: https://adv-r.hadley.nz/function-operators.html
.print_every <- function(f, n = 1, max = NULL, format = 'Function call {cli::bg_black(i)} of {max}.') {
force(f)
force(n)
i <- 0
function(...) {
i <<- i + 1
if (i %% n == 0) {
cat(glue::glue(format), sep = '\n')
}
f(...)
}
}
.parse_init <- function(path, rgx = '([0-9]+)-([0-9]+)-([0-9]+)') {
init <-
path %>%
jsonlite::read_json() %>%
enframe()
name <- path %>% basename() %>% tools::file_path_sans_ext()
.f_replace <- function(i) {
name %>% str_replace(rgx, sprintf('\\%d', i)) %>% as.integer()
}
list(
competition_id = .f_replace(1),
season_id = .f_replace(2),
match_id = .f_replace(3),
init = init
)
}
.parse_meta <- function(lst = NULL, path = NULL, ...) {
if(is.null(lst)) {
lst <- .parse_init(path, ...)
}
meta_init <-
lst$init %>%
rowwise() %>%
filter(length(value) == 1L) %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value) %>%
janitor::clean_names()
meta <-
meta_init %>%
unnest(cols = names(meta_init)) %>%
mutate(match_id = lst$match_id) %>%
relocate(competition_id, season_id, match_id)
meta
}
.parse_events <- function(lst = NULL, path = NULL, ...) {
if(is.null(lst)) {
lst <- .parse_init(path, ...)
}
events <-
lst$init %>%
filter(name == 'events') %>%
select(value) %>%
unnest(value) %>%
unnest_wider(value)
players <-
lst$init %>%
filter(name == 'playerIdNameDictionary') %>%
select(value) %>%
unnest_longer(value, indices_to = 'player_id', values_to = 'player_name') %>%
mutate(across(player_id, as.integer))
.unnest_side <- function(side) {
lst$init %>%
filter(name == !!side) %>%
select(value) %>%
unnest_wider(value) %>%
janitor::clean_names() %>%
select(team_id, team_name = name, side = field)
}
teams <- c('home', 'away') %>% map_dfr(.unnest_side)
events_clean_init1 <-
events %>%
janitor::clean_names() %>%
hoist(period, 'period' = 'value', 'period_name' = 'displayName') %>%
hoist(type, 'type' = 'value', 'type_name' = 'displayName') %>%
hoist(outcome_type, 'outcome_type' = 'value', 'outcome_type_name' = 'displayName')
if(any('cardType' == names(events))) {
# There's some weird thing here with `cardType` compared to the other elements that are hoisted.
# Since `cardType` is NULL by default while the others are always named lists, automatically renaming
# a hoisted element to the same name of the column that is being hoisted drops the column; one way
# to avoid this is to temporarily name the hoisted column to something different (in this case `cardTypex`).
events_clean_init1 <-
events_clean_init1 %>%
hoist(card_type, 'card_typex' = 'value', 'card_type_name' = 'displayName') %>%
rename(cardtype = card_typex)
}
events_clean_init2 <-
events_clean_init1 %>%
left_join(teams, by = 'team_id') %>%
left_join(players, by = 'player_id') %>%
mutate(competition_id = lst$competition_id, season_id = lst$season_id, match_id = lst$match_id) %>%
relocate(competition_id, season_id, match_id)
events_clean <-
events_clean_init2 %>%
select(-c(qualifiers, satisfied_events_types)) %>%
arrange(minute, event_id)
events_clean
}
.parse_match <- function(path) {
lst <- .parse_init(path)
meta <- .parse_meta(lst)
events <- .parse_events(lst)
# res <-
# events %>%
# bind_cols(
# meta %>% select(league_id, season_id, match_id)
# )
# list(events = events, meta = meta)
events
}
f_v <- .print_every(.parse_match, max = n_path)
res <-
paths %>%
tibble(path = .) %>%
mutate(data = map(path, f_v)) %>%
select(data) %>%
unnest(data)
res
res %>%
filter(type_name %>% str_detect('Sub')) %>%
relocate(player_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment