-
-
Save geotheory/68e7958b2ca96774ed8203e62e4cb7c1 to your computer and use it in GitHub Desktop.
Recursive Split: Split a dataframe into a nested list and reassemble back into a dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(purrr) | |
library(rrapply) | |
# Split a data frame into a nested list using a different column for each level. | |
# This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply, | |
# for loop, whatever). | |
# Nested lists can be useful for avoiding searching through your data: the data | |
# has already been indexed in the list. This can be pretty handy for saving time | |
# if you need to do lots of filtering stuff. | |
# You can index nested lists in a number of ways: | |
# imaginary_list$level_1$level_2$level_3 | |
# imaginary_list[["level_1"]][["level_2"]][["level_3"]] | |
# imaginary_list[[c("level_1", "level_2", "level_3")]] | |
# Functions -------------------------------------------------------------------- | |
# Just the usual split function from base R, but takes the split factor from | |
# a column in the dataframe | |
split_function <- function(x, column){ | |
split(x, f = x[[column]]) | |
} | |
# rrapply will apply our split function to our list no matter how nested it is. | |
# this means we don't have to mess around with nested loops. | |
# Works like base rapply but is way more flexible | |
recursive_split <- function(data, column){ | |
rrapply( | |
data, | |
f = split_function, | |
dfaslist = FALSE, | |
how = "replace", | |
column = column | |
) | |
} | |
# apply the recursive split function for each column. | |
# have to put the data in a list at first to avoid treating the data.frame as a | |
# list. the last step subsets the answer to cancel out this initial list-ing. | |
# slightly sneaky use of .init argument to make it clear that argument 1 and 2 | |
# are different. we're essentially doing: | |
# f(data, column_1) %>% f(column_2) %>% f(column_3) .... | |
apply_recursive_split <- function(data, columns){ | |
data_list <- list(data) | |
result <- reduce( | |
.x = columns, | |
.f = recursive_split, | |
.init = data_list | |
) | |
result[[1]] | |
} | |
# rebind the data into a data.frame | |
rebind <- function(data){ | |
# we can flatten our nested list into one list of data.frames and bind together | |
flat_data_list <- rrapply( | |
data, | |
f = identity, | |
dfaslist = FALSE, | |
how = "flatten" | |
) | |
# go along the list binding everything up | |
rebound_data <- reduce(flat_data_list, rbind) | |
# the original split keeps row names which we can use to reorder and be | |
# identical to the original data.frame | |
# pretty sneaky. won't work with tibbles I imagine | |
order <- as.character(1:nrow(rebound_data)) | |
rebound_data[order, ] | |
} | |
# Example ---------------------------------------------------------------------- | |
size <- 500 | |
data <- data.frame( | |
x = sample(letters[1:3], size = size, replace = TRUE), | |
y = sample(letters[24:26], size = size, replace = TRUE), | |
z = sample(letters[13:14], size = size, replace = TRUE), | |
a = sample(letters[9:12], size = size, replace = TRUE), | |
value1 = rnorm(n = size), | |
value2 = rnorm(n = size, mean = 10), | |
stringsAsFactors = FALSE | |
) | |
split_columns <- c("x", "y", "z", "a") | |
# make a big recursive list with all our data in | |
data_list <- apply_recursive_split(data, split_columns) | |
# you can do some fun things here to index it: | |
data_list[[c("a", "x", "m", "l")]] | |
# rebind the data together into a data.frame | |
rebound_data <- rebind(data_list) | |
identical(rebound_data, data) | |
# Tibbles ---------------------------------------------------------------------- | |
library(tibble) | |
# with tibbles we lose order, but the data is still the same | |
data_tibble <- tibble( | |
x = sample(letters[1:3], size = size, replace = TRUE), | |
y = sample(letters[24:26], size = size, replace = TRUE), | |
z = sample(letters[13:14], size = size, replace = TRUE), | |
a = sample(letters[9:12], size = size, replace = TRUE), | |
value1 = rnorm(n = size), | |
value2 = rnorm(n = size, mean = 10) | |
) | |
rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>% | |
rebind() | |
# tibbles have no row names, so we lose order! | |
identical(data_tibble, rebound_tibble) | |
# the data is still the same though | |
identical( | |
arrange(rebound_tibble, x, y, z, a, value1, value2), | |
arrange(data_tibble, x, y, z, a, value1, value2) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment