Skip to content

Instantly share code, notes, and snippets.

@RodriguezGoldstein
Created August 8, 2018 04:14
Show Gist options
  • Save RodriguezGoldstein/c9801f7545bc9008d95bd1a353fefa94 to your computer and use it in GitHub Desktop.
Save RodriguezGoldstein/c9801f7545bc9008d95bd1a353fefa94 to your computer and use it in GitHub Desktop.
# Token delimiter for unnest_tokens(token = "regex"): split on any character
# that is not a letter, digit, underscore, '#', '@' or apostrophe, and also on
# an apostrophe that is not followed by one of those word characters (so a
# trailing quote is a delimiter while the apostrophe in "don't" is kept).
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

# Text to blank out before tokenising. Alternation is ordered: at each
# position the first branch that matches wins, so the multi-character
# branches must precede the single-character catch-all. Branches, in order:
#   1. a run containing ".com" up to the next whitespace
#   2. http/https URLs plus trailing whitespace
#   3. #hashtags and @mentions plus trailing whitespace
#   4. any remaining single non-alphanumeric character (this already covers
#      newlines and double quotes, so they need no branch of their own)
# Previously branch 4 sat in front of branch 3 and consumed the lone '#'/'@',
# which left the tag text behind and made the hashtag/mention branch dead.
# NOTE(review): branch 1 starts with an unanchored greedy `.*`, so on a line
# containing ".com" it also swallows everything from the start of that line.
# Confirm that dropping the leading text is intended.
replace_reg <- "(.*.)\\.com(.*.)\\S+\\s|https?\\S+\\s*|[#@]\\S+\\s*|[^[:alnum:]]"
# Optional: restrict the input to a single entry type before cleaning, e.g.
# type_visited = "Visited"
# type_searched = "Searched"
# search_data[search_data$type == type_visited, ]$search
# Clean the raw search strings and break them into one-word-per-row tokens.
# Steps: blank out everything matched by replace_reg, replace bytes that are
# not valid ASCII with a space, lower-case, trim surrounding whitespace, wrap
# the result in a one-column tibble, split it into words on unnest_reg, then
# drop stop words and tokens that contain no lower-case letter.
search <- search_data$search %>%
  str_replace_all(pattern = replace_reg, replacement = " ") %>%
  iconv(from = "ASCII", to = "UTF-8", sub = " ") %>%
  tolower() %>%
  trimws() %>%
  tibble(text = .) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word) %>%
  filter(str_detect(word, "[a-z]"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment