Skip to content

Instantly share code, notes, and snippets.

@EricRohlfs
Created September 19, 2021 17:00
Show Gist options
  • Save EricRohlfs/6ba24ca17f266a548482da27cfe96a98 to your computer and use it in GitHub Desktop.
Save EricRohlfs/6ba24ca17f266a548482da27cfe96a98 to your computer and use it in GitHub Desktop.
corpus helpers
#' Reshape a document-feature matrix into a long data.table
#'
#' Converts `dfm` to a data.frame, turns it into a data.table whose row names
#' are stored in a `docid` column, and melts every other column so the result
#' has one row per (docid, original column) pair.
#'
#' @param dfm An object accepted by `convert(to = "data.frame")`; presumably a
#'   quanteda dfm -- confirm against callers.
#' @return A data.table with the columns `docid`, `document`, `word` and
#'   `occurance_of_word` (the misspelling is kept so existing callers that
#'   reference the column keep working).
pivotDFM <- function(dfm) {
  dfm_dt <- dfm %>%
    convert(to = "data.frame") %>%
    setDT(keep.rownames = c("docid", "doc_id")) # convert to data.table

  # Label every row with its document name by looking the row's docid up in
  # the un-melted table. The previous `docid == 1 ? value[1] : value[2]` logic
  # only worked for two documents; every later document got document 2's name.
  # Like the original, this relies on melt() emitting the first non-id column
  # (assumed to hold the document names) at the top of `value`, in row order.
  pivoted <- melt(dfm_dt, id.vars = "docid") %>%
    mutate(document = .$value[match(docid, dfm_dt$docid)]) %>%
    setcolorder(c("docid", "document", "variable", "value")) %>%
    setnames(
      c("docid", "document", "variable", "value"),
      c("docid", "document", "word", "occurance_of_word")
    )

  # TODO: the rows melted from the document-name column are still present in
  # the result (one per document). They are leftover column headers rather
  # than word counts and confuse users of this table; drop them once the
  # number of documents is tracked here.
  pivoted
}
#' Drop entries of a character vector that match any of several patterns
#'
#' Patterns are applied one after another; each pass prints the pattern and
#' the entries it is about to remove, then keeps only the non-matching ones.
#'
#' @param char_vector Character vector to filter.
#' @param search_pattern Vector of patterns handed to `str_detect()` one at a
#'   time, e.g. `c("@", "http")`.
#' @return `char_vector` without the entries matched by any pattern.
remove_from_char_vector <- function(char_vector, search_pattern) {
  remaining <- char_vector
  for (current_pattern in search_pattern) {
    print(paste0(
      "removing the following entries from the character list using search pattern :",
      current_pattern
    ))
    # Detect once and reuse the mask for both the report and the filter.
    is_match <- str_detect(remaining, pattern = current_pattern)
    print(remaining[is_match])
    remaining <- remaining[!is_match]
  }
  remaining
}
#' Build a document-feature matrix of the dashed words found in two texts
#'
#' Extracts every match of the dashed-word pattern from each input, joins the
#' matches of each input into a single string, and runs the two strings
#' through `corpus()`, `tokens()` and `dfm()`.
#'
#' @param sdp_text Character input searched for dashed words; its matches form
#'   the `sdp_docx` document.
#' @param pws_text Character input searched for dashed words; its matches form
#'   the `pws_docx` document.
#' @return The result of `dfm()` for a two-document corpus named `pws_docx`
#'   and `sdp_docx` (in that order).
dashed_words_dfm <- function(sdp_text, pws_text) {
  dashed_words_regex <- "(?=\\S*[-])([a-zA-Z0-9'-]+)"

  # str_extract_all() returns a list. Handing that list straight to toString()
  # deparses its elements, so text such as `c("a-b", "c-d")` or `character(0)`
  # ended up in the corpus and polluted the dfm with `c`, quotes, parentheses
  # and commas. Flatten first and join with spaces so only the words remain;
  # no matches yields an empty document.
  collapse_matches <- function(text) {
    matches <- unlist(str_extract_all(text, dashed_words_regex))
    paste(matches, collapse = " ")
  }

  dashed_docs <- c(
    pws_docx = collapse_matches(pws_text),
    sdp_docx = collapse_matches(sdp_text)
  )

  corpus(dashed_docs) %>%
    tokens() %>%
    dfm()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment