Last active
November 4, 2023 16:31
-
-
Save jmclawson/79e6994d491b3572d3a31e1f6a29ffc2 to your computer and use it in GitHub Desktop.
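Applies tidytext's unnest_tokens() function but also filters out any word that appears in the text only with an initial capital letter. In English texts, this should be a quick way to remove most proper nouns; note that any other word that never appears uncapitalized (for example, one used only at the start of a sentence) is removed as well.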
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Tokenize text into words, dropping words that only appear capitalized
#'
#' Tokenizes `column` with `tidytext::unnest_tokens()` and then removes every
#' word whose occurrences in the text all begin with a capital letter (A-Z).
#' A word is kept when at least one occurrence begins with a lowercase letter
#' (a-z), so a sentence-initial "The" survives as long as "the" also appears
#' somewhere in the text.
#'
#' @param df A data frame holding the text to tokenize.
#' @param column The column to tokenize, forwarded to the `input` argument of
#'   `tidytext::unnest_tokens()`. Defaults to `"text"`.
#' @return The output of `tidytext::unnest_tokens(df, word, column)` (tokens in
#'   a `word` column, using that function's default casing) with the rows for
#'   capital-only words filtered out.
#' @examples
#' df <- data.frame(text = "Alice saw the dog. The dog saw Alice.")
#' unnest_without_caps(df)
unnest_without_caps <- function(
    df,
    column = "text") {
  # Tokenize with the original casing preserved so capitalized and
  # uncapitalized occurrences of a word can be told apart.
  full <- df |>
    tidytext::unnest_tokens(word, {{ column }}, to_lower = FALSE)

  # Tokens that start with an uppercase ASCII letter.
  big <- full |>
    dplyr::filter(stringr::str_detect(word, "^[A-Z]")) |>
    dplyr::pull(word)

  # Tokens that start with a lowercase ASCII letter.
  small <- full |>
    dplyr::filter(stringr::str_detect(word, "^[a-z]")) |>
    dplyr::pull(word)

  # Lowercased forms of capitalized tokens that never occur with a lowercase
  # initial anywhere in the text.
  only_caps <- base::setdiff(tolower(big), small)

  # Tokenize again with the default casing and drop the capital-only words.
  df |>
    tidytext::unnest_tokens(word, {{ column }}) |>
    dplyr::filter(!word %in% only_caps)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment