library(tidyverse)
# Setting-up example hiearchy data
rep_unlist <- function(vec, n){
map(vec, rep, n) %>% unlist()
}
truth <- tibble(
country = rep_unlist(c("USA", "Canada"), 4),
state = rep_unlist(c("California", "Texas", "British Columbia", "Quebec"), 2),
city = c("San Francisco", "Los Angeles", "Austin", "Dallas", "Vancouver", "Whistler", "Montreal", "Gatineau")
)
set.seed(12)
data_messy <- truth %>%
mutate(state = map(state,
~ c(rep(.x, 3), sample(state, 2, replace = TRUE)) %>% sample(5)),
city = map(city, ~ c(rep(.x, 4), sample(city, 1, replace = TRUE)) %>% sample(5))) %>%
# select(country, where(is.list)) %>%
unnest(c(state, city)) %>%
mutate(weights = rnorm(n(), 1, 0.2)) %>%
select(country, state, city, weights)
# kinda like `dplyr::count()` but can use on a vector
count_vec <- function(wt){
if(is.null(wt)) return( n() )
sum(wt)
}
# given dataframe and multiple column selections keep the most frequent value of the last column (in it's grouped structure)
keep_majority <- function(data, vars_hiearchy, wt = NULL){
options(dplyr.summarise.inform = FALSE)
data %>%
group_by(across( {{vars_hiearchy}} )) %>%
summarise(n = count_vec( {{wt }} )) %>%
# depends on fact that summarise() drops last group
filter(n == max(n)) %>%
ungroup() %>%
select(-n)
}
# hiearchy_vars_c should be character string of at least length of 2
clean_hiearchy <- function(data, hiearchy_vars, wt = NULL, prefix = ""){
# Don't know how to do the vars set-up with expression but want to facilitate
# use of tidyselection, so falling back on this crutch :-(
hiearchy_vars_c <- select(data, {{hiearchy_vars}} ) %>% names()
len_vars <- length(hiearchy_vars_c)
vars_seqs <- map2(2:len_vars, seq_len(len_vars - 1), ~hiearchy_vars_c[c(.x, .y)])
suppressMessages(
map(vars_seqs, ~keep_majority(data, one_of(.x)), wt) %>%
reduce(left_join ) %>%
select(one_of(hiearchy_vars_c)) %>%
rename_with(~paste0(prefix, .x))
)
}
print(data_messy, n = 40)
#> # A tibble: 40 x 4
#> country state city weights
#> <chr> <chr> <chr> <dbl>
#> 1 USA Quebec San Francisco 1.09
#> 2 USA California Vancouver 1.40
#> 3 USA California San Francisco 0.790
#> 4 USA California San Francisco 1.15
#> 5 USA California San Francisco 1.11
#> 6 USA California Los Angeles 0.737
#> 7 USA California Los Angeles 0.950
#> 8 USA California Los Angeles 1.06
#> 9 USA California Gatineau 1.08
#> 10 USA British Columbia Los Angeles 1.20
#> 11 USA Texas Austin 1.17
#> 12 USA Texas Austin 1.04
#> 13 USA Texas Austin 1.17
#> 14 USA Texas Austin 1.17
#> 15 USA Texas Austin 1.39
#> 16 USA Quebec Austin 0.570
#> 17 USA Texas Dallas 1.19
#> 18 USA California Dallas 1.23
#> 19 USA Texas Dallas 0.895
#> 20 USA Texas Dallas 1.05
#> 21 Canada British Columbia Vancouver 0.914
#> 22 Canada Texas Vancouver 0.963
#> 23 Canada British Columbia Vancouver 0.979
#> 24 Canada British Columbia Vancouver 0.873
#> 25 Canada British Columbia Vancouver 0.746
#> 26 Canada British Columbia San Francisco 0.923
#> 27 Canada British Columbia Whistler 1.10
#> 28 Canada Quebec Whistler 0.964
#> 29 Canada British Columbia Whistler 1.00
#> 30 Canada British Columbia Whistler 0.745
#> 31 Canada California Montreal 0.960
#> 32 Canada Quebec Montreal 1.23
#> 33 Canada Quebec Los Angeles 0.995
#> 34 Canada Quebec Montreal 1.18
#> 35 Canada Quebec Montreal 0.965
#> 36 Canada Quebec Gatineau 1.22
#> 37 Canada Texas Austin 0.892
#> 38 Canada Quebec Gatineau 0.807
#> 39 Canada Quebec Gatineau 1.08
#> 40 Canada Quebec Gatineau 0.803
clean_hiearchy(data_messy, c(country, state, city), wt = weights)
#> # A tibble: 8 x 3
#> country state city
#> <chr> <chr> <chr>
#> 1 Canada British Columbia Vancouver
#> 2 Canada British Columbia Whistler
#> 3 USA California Los Angeles
#> 4 USA California San Francisco
#> 5 Canada Quebec Gatineau
#> 6 Canada Quebec Montreal
#> 7 USA Texas Austin
#> 8 USA Texas DallasCreated on 2021-11-10 by the reprex package (v0.2.1)