Created
November 30, 2021 23:25
-
-
Save Tadge-Analytics/22079c9f5af4ec385427506a498ca37b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(readxl) | |
################################################################### | |
path_to_workbook <- "top_female_poker_players_and_events.xlsx" | |
workbook_sheets <- excel_sheets(path_to_workbook) | |
all_sheets_imported <- | |
workbook_sheets %>% | |
tibble(sheet = .) %>% | |
mutate(sheet_data = map(sheet, ~read_excel(path_to_workbook, sheet = .x, col_types = "text") %>% janitor::clean_names()), | |
sheet = snakecase::to_snake_case(sheet)) | |
map2(all_sheets_imported$sheet, all_sheets_imported$sheet_data, ~assign(.x, .y, envir = .GlobalEnv)) | |
rm(list = setdiff(ls(), all_sheets_imported$sheet)) | |
################################################################### | |
# here, I make three distinct "tidy" tables (unique_players, unique_events, all_events_only_req_info) | |
# just to "get my head around" the data properly | |
unique_events <- | |
top_100_poker_events %>% | |
distinct(event_name, event_date, event_country) %>% | |
mutate(event_date = openxlsx::convertToDate(event_date), | |
event_id = row_number(), .before = 1) | |
unique_players <- | |
top_100 %>% | |
select(player_id, name, country, all_time_money_usd, position) %>% | |
mutate(player_id = as.numeric(player_id), | |
all_time_money_usd = as.numeric(all_time_money_usd), | |
position = parse_number(position)) | |
all_events_only_req_info <- | |
top_100_poker_events %>% | |
mutate(event_date = openxlsx::convertToDate(event_date)) %>% | |
inner_join(unique_events, by = c("event_name", "event_date", "event_country")) %>% | |
select(-event_name, -event_country, -source, -last_updated) %>% | |
mutate(player_place = parse_number(player_place), | |
was_event_winner = player_place == 1, # Create a column to count when the player finished 1st in an event | |
across(c(prize_usd, player_id), ~as.numeric(.x)), | |
prize_usd = replace_na(prize_usd, 0)) # Replace any nulls in prize_usd with zero | |
################################################################### | |
# generate aggregations | |
aggregated_player_stats <- | |
all_events_only_req_info %>% | |
inner_join(unique_events %>% select(-event_date), by = "event_id") %>% | |
group_by(player_id) %>% | |
summarise(first_event = min(event_date), # Find the dates of the players first and last events | |
latest_event = max(event_date), | |
career_length_yrs = round(as.numeric((latest_event - first_event))/365, 1), # calculate the length of poker career in years (with decimals) | |
total_prize_money = sum(prize_usd), # Total prize money | |
biggest_win = max(prize_usd), # Their biggest win | |
total_events = n(), # Number of events they've taken part in | |
events_won = sum(was_event_winner) %>% replace_na(0), | |
percentage_of_events_won = paste0(sprintf("%.1f", 100*(events_won/total_events)), "%"), # The percentage of events they've won | |
countries_visited = n_distinct(event_country) # The distinct count of the countries played in | |
) | |
# Reduce the data to name, number of events, total prize money, biggest win, percentage won, countries visited, career length | |
unique_players_with_stats <- | |
unique_players %>% | |
inner_join(aggregated_player_stats, by = "player_id") | |
# save to csv | |
unique_players_with_stats %>% | |
write_csv("output.csv") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment