tidyr::separate()
cannot handle the cell that contains arbitrary order of values.
separate_rows()
+ pivot_wider()
works to some extent, but this cannot handle an empty string or a missing value.
tibble(
id = 1:4,
answer = c("a,b", "a,c", "a,b,c", "c")
) %>%
# separate() だとできないのでいったん縦向きに広げてから pivot する
separate_rows(answer) %>%
# pivot するのに適当な値が必要
dplyr::mutate(t = TRUE) %>%
pivot_wider(names_from = answer, values_from = t, values_fill = FALSE)
#> # A tibble: 4 x 4
#> id a b c
#> <int> <lgl> <lgl> <lgl>
#> 1 1 TRUE TRUE FALSE
#> 2 2 TRUE FALSE TRUE
#> 3 3 TRUE TRUE TRUE
#> 4 4 FALSE FALSE TRUE
This function is for the case.
library(dplyr, warn.conflicts = FALSE)
separate_cols <- function(x, sep = ",") {
# str_split() returns a list of split character vectors
s <- stringr::str_split(x, sep)
all_cols <- unique(purrr::flatten_chr(s))
all_cols <- all_cols[!is.na(all_cols) & all_cols != ""]
tmpl <- tibble::tibble(.rows = 1L)
tmpl[1L, all_cols] <- FALSE
s <- s %>%
purrr::map(
~ {
if (identical(.x, "") || is.na(.x)) {
tmpl
} else {
tmpl[1L, .x] <- TRUE
tmpl
}
}
)
dplyr::bind_rows(s)
}
separate_cols(c("a,b", "a,c", "a,b,c", "", NA))
#> # A tibble: 5 x 3
#> a b c
#> <lgl> <lgl> <lgl>
#> 1 TRUE TRUE FALSE
#> 2 TRUE FALSE TRUE
#> 3 TRUE TRUE TRUE
#> 4 FALSE FALSE FALSE
#> 5 FALSE FALSE FALSE
tibble(
id = 1:5,
answer = c("a,b", "a,c", "a,b,c", "", NA)
) %>%
mutate(separate_cols(answer), .keep = "unused")
#> # A tibble: 5 x 4
#> id a b c
#> <int> <lgl> <lgl> <lgl>
#> 1 1 TRUE TRUE FALSE
#> 2 2 TRUE FALSE TRUE
#> 3 3 TRUE TRUE TRUE
#> 4 4 FALSE FALSE FALSE
#> 5 5 FALSE FALSE FALSE