Regular expression buffs, I need your help with this.
I have a string of the form: s <- "Variable Name (Shortcut) (Variable Unit)".
I need to extract two things: a) Everything BEFORE (Variable Unit); b) Variable Unit from inside the last set of parentheses. How? #rstats
https://twitter.com/IsabellaGhement/status/1597653720841736194?s=20&t=ICQ5FEcBplmVPZIVT0hxOQ
library(dplyr)
library(tidyr)
library(stringr)
# Sample inputs: a variable name, sometimes a parenthesised shortcut, and a
# parenthesised unit as the last piece.
elements <- c(
  "Total Zinc (T) (mu/g)",
  "PH (%)",
  "Total Iron (mu/g)"
)
# Wrap the vector in a one-column tibble so the dplyr verbs below can be used.
elements_tbl <- tibble(element = elements)
# str_split() approach --------------------------------------------------------
# This approach minimizes the amount of regex you have to write, and lets dplyr
# do the heavy lifting. It's verbose but quicker to write and easier to
# understand.
# Names for the pieces of each string, in the order str_split() returns them.
element_section_names <- c("name", "shortcut", "unit")
# The element strings don't have any indicator of what is a unit versus a
# shortcut, so we need to specify this ourselves to be able to distinguish
# them.
element_units <- c("(%)", "(mu/g)")
elements_tbl %>%
  # Split on the space in front of each opening parenthesis. This creates a
  # list column holding one character vector of pieces per row. The pieces are
  # named by position up front: unnest_wider() in tidyr >= 1.3.0 errors on
  # unnamed vectors unless names_sep is supplied, and older versions fall back
  # to name repair and print "New names:" messages.
  mutate(
    strings = lapply(
      str_split(element, " (?=\\()"),
      function(parts) setNames(parts, element_section_names[seq_along(parts)])
    )
  ) %>%
  # One column per named piece; rows with fewer pieces get NA in the rest.
  unnest_wider(strings) %>%
  # Then we can wrangle the new columns containing the three parts of our
  # string to get the desired result.
  mutate(
    # When a string has no shortcut, its unit is the second piece and so sits
    # in the shortcut column; copy it across to unit ...
    unit = case_when(
      shortcut %in% element_units ~ shortcut,
      TRUE ~ unit
    ),
    # ... and blank out the misplaced value in shortcut.
    shortcut = case_when(
      shortcut %in% element_units ~ NA_character_,
      TRUE ~ shortcut
    ),
    # Everything before the unit: the name, plus the shortcut when present.
    name_shortcut = case_when(
      !is.na(shortcut) ~ paste(name, shortcut),
      is.na(shortcut) ~ name
    )
  )
# str_extract() approach ------------------------------------------------------
# This approach minimizes the amount of data wrangling you have to do, and uses
# regex for the heavy lifting. It's compact but slower to write and harder to
# understand. Using stringr::str_view() helps while figuring out the patterns
# you need.
elements_tbl %>%
  mutate(
    # Lazy match: everything before the first " (".
    name = str_extract(element, ".+?(?= \\()"),
    # Parenthesised text that is followed by another " (", i.e. not the last
    # group; NA when the string has only one parenthesised group.
    shortcut = str_extract(element, "\\(.+\\)(?= \\()"),
    # The last parenthesised group: one with no "(" anywhere after it. Testing
    # for "last" directly (rather than "not followed by a space") keeps this
    # correct when a shortcut is followed by punctuation or the string has
    # trailing whitespace.
    unit = str_extract(element, "\\([^()]+\\)(?!.*\\()"),
    # Greedy match: everything before the last " (".
    name_shortcut = str_extract(element, ".+(?= \\()")
  )
library(dplyr)
library(tidyr)
library(stringr)
# Sample inputs: a variable name, sometimes a parenthesised shortcut, and a
# parenthesised unit as the last piece.
elements <- c(
  "Total Zinc (T) (mu/g)",
  "PH (%)",
  "Total Iron (mu/g)"
)
# Wrap the vector in a one-column tibble so the dplyr verbs below can be used.
elements_tbl <- tibble(element = elements)
# str_split() approach --------------------------------------------------------
# This approach minimizes the amount of regex you have to write, and lets dplyr
# do the heavy lifting. It's verbose but quicker to write and easier to
# understand.
# Names for the pieces of each string, in the order str_split() returns them.
element_section_names <- c("name", "shortcut", "unit")
# The element strings don't have any indicator of what is a unit versus a
# shortcut, so we need to specify this ourselves to be able to distinguish
# them.
element_units <- c("(%)", "(mu/g)")
elements_tbl %>%
  # Split on the space in front of each opening parenthesis. This creates a
  # list column holding one character vector of pieces per row. The pieces are
  # named by position up front: unnest_wider() in tidyr >= 1.3.0 errors on
  # unnamed vectors unless names_sep is supplied, and older versions fall back
  # to name repair and print "New names:" messages.
  mutate(
    strings = lapply(
      str_split(element, " (?=\\()"),
      function(parts) setNames(parts, element_section_names[seq_along(parts)])
    )
  ) %>%
  # One column per named piece; rows with fewer pieces get NA in the rest.
  unnest_wider(strings) %>%
  # Then we can wrangle the new columns containing the three parts of our
  # string to get the desired result.
  mutate(
    # When a string has no shortcut, its unit is the second piece and so sits
    # in the shortcut column; copy it across to unit ...
    unit = case_when(
      shortcut %in% element_units ~ shortcut,
      TRUE ~ unit
    ),
    # ... and blank out the misplaced value in shortcut.
    shortcut = case_when(
      shortcut %in% element_units ~ NA_character_,
      TRUE ~ shortcut
    ),
    # Everything before the unit: the name, plus the shortcut when present.
    name_shortcut = case_when(
      !is.na(shortcut) ~ paste(name, shortcut),
      is.na(shortcut) ~ name
    )
  )
#> # A tibble: 3 × 5
#> element name shortcut unit name_shortcut
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Total Zinc (T) (mu/g) Total Zinc (T) (mu/g) Total Zinc (T)
#> 2 PH (%) PH <NA> (%) PH
#> 3 Total Iron (mu/g) Total Iron <NA> (mu/g) Total Iron
# str_extract() approach ------------------------------------------------------
# This approach minimizes the amount of data wrangling you have to do, and uses
# regex for the heavy lifting. It's compact but slower to write and harder to
# understand. Using stringr::str_view() helps while figuring out the patterns
# you need.
elements_tbl %>%
  mutate(
    # Lazy match: everything before the first " (".
    name = str_extract(element, ".+?(?= \\()"),
    # Parenthesised text that is followed by another " (", i.e. not the last
    # group; NA when the string has only one parenthesised group.
    shortcut = str_extract(element, "\\(.+\\)(?= \\()"),
    # The last parenthesised group: one with no "(" anywhere after it. Testing
    # for "last" directly (rather than "not followed by a space") keeps this
    # correct when a shortcut is followed by punctuation or the string has
    # trailing whitespace.
    unit = str_extract(element, "\\([^()]+\\)(?!.*\\()"),
    # Greedy match: everything before the last " (".
    name_shortcut = str_extract(element, ".+(?= \\()")
  )
#> # A tibble: 3 × 5
#> element name shortcut unit name_shortcut
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Total Zinc (T) (mu/g) Total Zinc (T) (mu/g) Total Zinc (T)
#> 2 PH (%) PH <NA> (%) PH
#> 3 Total Iron (mu/g) Total Iron <NA> (mu/g) Total Iron
Created on 2022-11-29 by the reprex package (v2.0.1)