geotheory · November 1, 2020 01:36
diff --git a/recursive_split b/recursive_split
 library(purrr)
 library(rrapply)

 # Split a data frame into a nested list using a different column for each level.
 # This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply,
 # for loop, whatever).

 # Nested lists can be useful for avoiding searching through your data: the data
 # has already been indexed in the list. This can be pretty handy for saving time 
 # if you need to do lots of filtering stuff. 
 # You can index nested lists in a number of ways:
 # imaginary_list$level_1$level_2$level_3
 # imaginary_list[["level_1"]][["level_2"]][["level_3"]]
 # imaginary_list[[c("level_1", "level_2", "level_3")]]

 # Functions --------------------------------------------------------------------
 # Just the usual split function from base R, but takes the split factor from
 # a column in the dataframe
 split_function <- function(x, column){ 
  
  split(x, f = x[[column]]) 
  
 }

 # rrapply will apply our split function to our list no matter how nested it is.
 # this means we don't have to mess around with nested loops.
 # Works like base rapply but is way more flexible
 recursive_split <- function(data, column){
  rrapply(
    data,
    f = split_function,
    dfaslist = FALSE,
    how = "replace",
    column = column
  )
 }

 # apply the recursive split function for each column.
 # have to put the data in a list at first to avoid treating the data.frame as a
 # list. the last step subsets the answer to cancel out this initial list-ing.
 # slightly sneaky use of .init argument to make it clear that argument 1 and 2 
 # are different. we're essentially doing:
 # f(data, column_1) %>% f(column_2) %>% f(column_3) ....
 apply_recursive_split <- function(data, columns){
  
  data_list <- list(data)
  
  result <- reduce(
    .x = columns,
    .f = recursive_split,
    .init = data_list
  )
  
  result[[1]]
 }

 # rebind the data into a data.frame
 rebind <- function(data){
  
  # we can flatten our nested list into one list of data.frames and bind together
  flat_data_list <- rrapply(
    data,
    f = identity,
    dfaslist = FALSE,
    how = "flatten"
  )
  
  # go along the list binding everything up
  rebound_data <- reduce(flat_data_list, rbind)
  
  # the original split keeps row names which we can use to reorder and be 
  # identical to the original data.frame
  # pretty sneaky. won't work with tibbles I imagine
  order <- as.character(1:nrow(rebound_data))
  
  rebound_data[order, ]
  
 }

 # Example ----------------------------------------------------------------------
 size <- 500
 data <- data.frame(
  x = sample(letters[1:3], size = size, replace = TRUE),
  y = sample(letters[24:26], size = size, replace = TRUE),
  z = sample(letters[13:14], size = size, replace = TRUE),
  a = sample(letters[9:12], size = size, replace = TRUE),
  value1 = rnorm(n = size),
  value2 = rnorm(n = size, mean = 10),
  stringsAsFactors = FALSE
 )

 split_columns <- c("x", "y", "z", "a")

 # make a big recursive list with all our data in
 data_list <- apply_recursive_split(data, split_columns)

 # you can do some fun things here to index it:
 data_list[[c("a", "x", "m", "l")]]

 # rebind the data together into a data.frame
 rebound_data <- rebind(data_list)

 identical(rebound_data, data)


 # Tibbles ----------------------------------------------------------------------
 library(tibble)

 # with tibbles we lose order, but the data is still the same
 data_tibble <- tibble(
  x = sample(letters[1:3], size = size, replace = TRUE),
  y = sample(letters[24:26], size = size, replace = TRUE),
  z = sample(letters[13:14], size = size, replace = TRUE),
  a = sample(letters[9:12], size = size, replace = TRUE),
  value1 = rnorm(n = size),
  value2 = rnorm(n = size, mean = 10)
 )

 rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>%
  rebind()

 # tibbles have no row names, so we lose order!
 identical(data_tibble, rebound_tibble)

 # the data is still the same though
 identical(
  arrange(rebound_tibble, x, y, z, a, value1, value2),
  arrange(data_tibble, x, y, z, a, value1, value2)
 )
	library(purrr)
	library(rrapply)

	# Split a data frame into a nested list using a different column for each level.
	# This uses rrapply::rrapply() to avoid having to do any nested loops (map, lapply,
	# for loop, whatever).

	# Nested lists can be useful for avoiding searching through your data: the data
	# has already been indexed in the list. This can be pretty handy for saving time
	# if you need to do lots of filtering stuff.
	# You can index nested lists in a number of ways:
	# imaginary_list$level_1$level_2$level_3
	# imaginary_list[["level_1"]][["level_2"]][["level_3"]]
	# imaginary_list[[c("level_1", "level_2", "level_3")]]

	# Functions --------------------------------------------------------------------
	# Just the usual split function from base R, but takes the split factor from
	# a column in the dataframe
	split_function <- function(x, column){

	split(x, f = x[[column]])

	}

	# rrapply will apply our split function to our list no matter how nested it is.
	# this means we don't have to mess around with nested loops.
	# Works like base rapply but is way more flexible
	recursive_split <- function(data, column){
	rrapply(
	data,
	f = split_function,
	dfaslist = FALSE,
	how = "replace",
	column = column
	)
	}

	# apply the recursive split function for each column.
	# have to put the data in a list at first to avoid treating the data.frame as a
	# list. the last step subsets the answer to cancel out this initial list-ing.
	# slightly sneaky use of .init argument to make it clear that argument 1 and 2
	# are different. we're essentially doing:
	# f(data, column_1) %>% f(column_2) %>% f(column_3) ....
	apply_recursive_split <- function(data, columns){

	data_list <- list(data)

	result <- reduce(
	.x = columns,
	.f = recursive_split,
	.init = data_list
	)

	result[[1]]
	}

	# rebind the data into a data.frame
	rebind <- function(data){

	# we can flatten our nested list into one list of data.frames and bind together
	flat_data_list <- rrapply(
	data,
	f = identity,
	dfaslist = FALSE,
	how = "flatten"
	)

	# go along the list binding everything up
	rebound_data <- reduce(flat_data_list, rbind)

	# the original split keeps row names which we can use to reorder and be
	# identical to the original data.frame
	# pretty sneaky. won't work with tibbles I imagine
	order <- as.character(1:nrow(rebound_data))

	rebound_data[order, ]

	}

	# Example ----------------------------------------------------------------------
	size <- 500
	data <- data.frame(
	x = sample(letters[1:3], size = size, replace = TRUE),
	y = sample(letters[24:26], size = size, replace = TRUE),
	z = sample(letters[13:14], size = size, replace = TRUE),
	a = sample(letters[9:12], size = size, replace = TRUE),
	value1 = rnorm(n = size),
	value2 = rnorm(n = size, mean = 10),
	stringsAsFactors = FALSE
	)

	split_columns <- c("x", "y", "z", "a")

	# make a big recursive list with all our data in
	data_list <- apply_recursive_split(data, split_columns)

	# you can do some fun things here to index it:
	data_list[[c("a", "x", "m", "l")]]

	# rebind the data together into a data.frame
	rebound_data <- rebind(data_list)

	identical(rebound_data, data)


	# Tibbles ----------------------------------------------------------------------
	library(tibble)

	# with tibbles we lose order, but the data is still the same
	data_tibble <- tibble(
	x = sample(letters[1:3], size = size, replace = TRUE),
	y = sample(letters[24:26], size = size, replace = TRUE),
	z = sample(letters[13:14], size = size, replace = TRUE),
	a = sample(letters[9:12], size = size, replace = TRUE),
	value1 = rnorm(n = size),
	value2 = rnorm(n = size, mean = 10)
	)

	rebound_tibble <- apply_recursive_split(data_tibble, split_columns) %>%
	rebind()

	# tibbles have no row names, so we lose order!
	identical(data_tibble, rebound_tibble)

	# the data is still the same though
	identical(
	arrange(rebound_tibble, x, y, z, a, value1, value2),
	arrange(data_tibble, x, y, z, a, value1, value2)
	)