Skip to content

Instantly share code, notes, and snippets.

@tonyelhabr
Created December 23, 2024 04:50
Show Gist options
  • Save tonyelhabr/bc0150f2d76d7b2e2efad5fc6cf30a44 to your computer and use it in GitHub Desktop.
Save tonyelhabr/bc0150f2d76d7b2e2efad5fc6cf30a44 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
library(janitor)
page <- read_html('https://mychiptime.com/searchevent.php?id=15902#%206')
raw_table <- page |> html_table() |> pluck(1)
melted_table <- raw_table |>
mutate(
row = row_number()
) |>
pivot_longer(
-row,
names_to = 'col',
values_to = 'value'
) |>
mutate(
col = as.integer(gsub('^X', '', col))
)
filt_melted_table <- melted_table |>
# last col_label has nothing
filter(col != max(col))
column_header_rows <- filt_melted_table |>
filter(col == 1, value == 'Place') |>
pull(row)
column_mapping <- filt_melted_table |>
filter(row == column_header_rows[1]) |>
select(col, col_label = value) |>
group_by(col_label) |>
mutate(
rn = row_number(),
max_rn = max(rn)
) |>
ungroup() |>
transmute(
col,
col_label = case_when(
col_label == 'Time' & rn == 5 ~ 'chip_time',
col_label == 'Time' & rn == 6 ~ 'gun_time',
max_rn >= 4 ~ paste0(col_label, '_', rn),
TRUE ~ col_label
)
)
group_labels <- filt_melted_table |>
filter(
col == 1,
# data_type == 'character',
grepl('^[A-Z]', value),
!(row %in% column_header_rows),
!(value %in% c('DNF', 'DQ', 'Top'))
) |>
distinct(row, group = value)
result_rows <- filt_melted_table |>
filter(
col == 1,
grepl('^[1-9]', value)
) |>
pull(row) |>
unique()
wide_table <- filt_melted_table |>
filter(row %in% result_rows) |>
left_join(
column_mapping |>
select(col, col_label),
by = join_by(col)
) |>
select(
row,
col_label,
value
) |>
pivot_wider(
names_from = col_label,
values_from = value
) |>
clean_names()
wide_table_with_groups <- wide_table |>
left_join(
group_labels |>
select(
group_label_row = row,
group
) |>
mutate(
next_group_label_row = lead(group_label_row, default = max(wide_table$row) + 1L)
),
by = join_by(
row >= group_label_row,
row < next_group_label_row
)
) |>
relocate(
group,
.after = row
) |>
select(
-c(group_label_row, next_group_label_row)
)
repivoted_table <- wide_table_with_groups |>
pivot_longer(
matches('_[1-4]$'),
names_to = 'leg_key',
values_to = 'leg_value'
) |>
separate_wider_delim(
leg_key,
delim = '_',
names = c('leg_key', 'leg')
) |>
pivot_wider(
names_from = leg_key,
values_from = leg_value
)
parse_pace <- function(x) {
time_str <- gsub('\\/M', '', x)
parsed_time <- parse_time(time_str)
as.integer(parsed_time) / 60
}
clean_table <- repivoted_table |>
mutate(
pace = parse_pace(pace),
# across(
# c(
# chip_time,
# gun_time
# )
# )
across(
c(
place,
bib_no,
age,
leg,
rnk
),
\(.x) as.integer(.x)
)
)
clean_table |>
count(name, sort = TRUE) |>
count(n, sort = TRUE)
convert_seconds_to_time <- function(x) {
minutes <- x %/% 60
seconds <- x %% 60
sprintf('%0.2d:%05.2f', minutes, seconds)
}
pace_breakdown <- clean_table |>
group_by(place, name, group) |>
summarize(
across(
pace,
list(
# var = \(.x) var(.x),
# rng = \(.x) max(.x) - min(.x),
# mean = \(.x) mean(.x),
mad = \(.x) mean(abs(.x - mean(.x))),
top2 = \(.x) mean(sort(.x, decreasing = FALSE)[1:2]),
bot2 = \(.x) mean(sort(.x, decreasing = TRUE)[1:2])
),
.names = '{.fn}'
)
) |>
ungroup() |>
mutate(
across(
-c(place, name, group),
list(str = \(.x) convert_seconds_to_time(.x))
)
) |>
arrange(desc(mad))
pace_breakdown |>
filter(
group != 'JUST FOR FUN'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment