jeremy-allen · November 22, 2021 17:42
diff --git a/separate_and_long.R b/separate_and_long.R
 library(tidyverse)

 # This fake data matches real-world data I was given.
 # The level of analysis is the units of peppers (rows),
 # but we see subunits here, and need to make those the
 # level of analysis - one subunit per row, i.e., tidy.
 # Notice the description column is not named like the
 # other subunit columns.
 # This data matches the example described on the pivot_longer
 # help page as "Multiple observations per row."
 # A complication is that each Description does not 
 # contain the same number of subunits.

 peppers <- tibble(
  unit_id = c("000000001", "000000002", "000000003"),
  date = c("2021-01-01", "2021-01-02", "2021-01-01"),
  description = c("subunit 1 is great and subunit 2 is cool", "subunit 1 is awesome and subunit 2 is amazing", "subunit 1 is bad and subunit 2 is gross and subunit 3 is rotten"),
  su1_weight = c(100, 50, 150),
  su1_length = c(5, 2, 7),
  su2_weight = c(200, 150, 300),
  su2_length = c(12, 7, 24)
 )

 peppers

 # The description column can be separated into new columns,
 # and if we make names of the new columns match the format of
 # the other subunit column names we can take advantage of a certain
 # pivot_longer argument.

 peppers_sep <- peppers %>% 
  separate(
    description,
    into = c("su1_description", "su2_description", "su3_description"),
    sep = "\\sand\\s"
  )

 peppers_sep

 peppers_long <- peppers_sep %>% pivot_longer(
  cols = starts_with("su"), # now we can do because all subunit cols start this way now
  names_to = c("subunit", ".value"), # the special argument with .value!
  names_pattern = "(su\\d)_(\\w+)" # regex that matches existing col names
 )

 peppers_long

 # Because unit 000000003 had three subunits while the others
 # only two, we get descritpion rows with NA. So we drop those rows.
 peppers_complete <- peppers_long %>% 
  filter(!is.na(description))

 peppers_complete
	library(tidyverse)

	# This fake data matches real-world data I was given.
	# The level of analysis is the units of peppers (rows),
	# but we see subunits here, and need to make those the
	# level of analysis - one subunit per row, i.e., tidy.
	# Notice the description column is not named like the
	# other subunit columns.
	# This data matches the example described on the pivot_longer
	# help page as "Multiple observations per row."
	# A complication is that each Description does not
	# contain the same number of subunits.

	peppers <- tibble(
	unit_id = c("000000001", "000000002", "000000003"),
	date = c("2021-01-01", "2021-01-02", "2021-01-01"),
	description = c("subunit 1 is great and subunit 2 is cool", "subunit 1 is awesome and subunit 2 is amazing", "subunit 1 is bad and subunit 2 is gross and subunit 3 is rotten"),
	su1_weight = c(100, 50, 150),
	su1_length = c(5, 2, 7),
	su2_weight = c(200, 150, 300),
	su2_length = c(12, 7, 24)
	)

	peppers

	# The description column can be separated into new columns,
	# and if we make names of the new columns match the format of
	# the other subunit column names we can take advantage of a certain
	# pivot_longer argument.

	peppers_sep <- peppers %>%
	separate(
	description,
	into = c("su1_description", "su2_description", "su3_description"),
	sep = "\\sand\\s"
	)

	peppers_sep

	peppers_long <- peppers_sep %>% pivot_longer(
	cols = starts_with("su"), # now we can do because all subunit cols start this way now
	names_to = c("subunit", ".value"), # the special argument with .value!
	names_pattern = "(su\\d)_(\\w+)" # regex that matches existing col names
	)

	peppers_long

	# Because unit 000000003 had three subunits while the others
	# only two, we get descritpion rows with NA. So we drop those rows.
	peppers_complete <- peppers_long %>%
	filter(!is.na(description))

	peppers_complete