Created
September 19, 2021 17:00
-
-
Save EricRohlfs/6ba24ca17f266a548482da27cfe96a98 to your computer and use it in GitHub Desktop.
corpus helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Pivot a document-feature matrix into long format
#'
#' Converts `dfm` to a data.frame, turns it into a data.table whose row
#' names are kept in a `docid` column, melts it on `docid`, and adds a
#' `document` column holding the document name for each row.
#'
#' Assumptions to confirm against the installed package versions:
#' * `convert(to = "data.frame")` yields a leading `doc_id` column and
#'   default row names 1..n.
#' * `setDT()` uses only the first element of `keep.rownames` as the name of
#'   the row-name column (hence `docid`).
#' * `melt()` stacks measure columns in order, so the first n values of the
#'   melted `value` column are the document names.
#'
#' @param dfm Object accepted by `convert(to = "data.frame")`; presumably a
#'   quanteda dfm.
#' @return A data.table with columns `docid`, `document`, `word`,
#'   `occurance_of_word`.
pivotDFM <- function(dfm) {
  dfm_dt <- dfm %>%
    convert(to = "data.frame") %>%
    setDT(keep.rownames = c("docid", "doc_id")) # convert to data.table

  pivoted <- melt(dfm_dt, id.vars = "docid") %>%
    # Look the name up by row number instead of hard-coding documents 1 and 2,
    # so that a third or later document is not labelled with the second
    # document's name. Identical to the previous result for one or two docs.
    mutate(document = value[as.integer(docid)]) %>%
    setcolorder(c("docid", "document", "variable", "value")) %>%
    setnames(
      c("docid", "document", "variable", "value"),
      c("docid", "document", "word", "occurance_of_word")
    )

  # TODO: the rows produced by melting the old `doc_id` column (one per
  # document, word == "doc_id") are still in the result. They are leftover
  # column headers and confuse users of this table; drop them once callers
  # no longer depend on them.
  return(pivoted)
}
#' Drop entries of a character vector that match any of several patterns
#'
#' Patterns are applied one after another. For each pattern a notice and the
#' entries about to be dropped are printed, then those entries are removed
#' before the next pattern is tried.
#'
#' @param char_vector Character vector to filter.
#' @param search_pattern Character vector of patterns passed to
#'   `str_detect()`, e.g. `c("@", "http")`.
#' @return `char_vector` without the entries that matched a pattern. `NA`
#'   entries never count as a match and are kept.
remove_from_char_vector <- function(char_vector, search_pattern) {
  kept <- char_vector
  for (sp in search_pattern) {
    print(paste0(
      "removing the following entries from the character list using search pattern :",
      sp
    ))
    # Detect once per pattern; the same mask drives the report and the filter.
    hits <- str_detect(kept, pattern = sp)
    # str_detect() yields NA for NA input. Those entries are not removed, so
    # they must not be listed as removed (nor turn into NA when subsetting).
    hits[is.na(hits)] <- FALSE
    print(kept[hits])
    kept <- kept[!hits]
  }
  return(kept)
}
#' Build a document-feature matrix of the hyphenated words in two texts
#'
#' Extracts every hyphen-containing word from each text, joins the matches
#' into one string per text, and runs `corpus() %>% tokens() %>% dfm()` on the
#' two resulting documents.
#'
#' @param sdp_text Character text; its matches form the document `sdp_docx`.
#' @param pws_text Character text; its matches form the document `pws_docx`.
#' @return The object returned by `dfm()` for the two-document corpus
#'   (`pws_docx` first, then `sdp_docx`).
dashed_words_dfm <- function(sdp_text, pws_text) {
  # The lookahead requires a "-" later in the current run of non-space
  # characters; the group then takes letters, digits, apostrophes and hyphens.
  dashed_words_regex <- "(?=\\S*[-])([a-zA-Z0-9'-]+)"

  # str_extract_all() returns a list. Calling toString() on that list
  # deparses its elements (`c("a-b", "c-d")`, or `character(0)` when nothing
  # matched), which leaked `c`, parentheses, quotes and commas into the
  # corpus as junk features. Flatten first and join with spaces so the
  # document text contains only the extracted words.
  collapse_matches <- function(text) {
    matches <- unlist(str_extract_all(text, dashed_words_regex))
    paste(matches, collapse = " ")
  }

  dashed_dfm <- corpus(c(
    pws_docx = collapse_matches(pws_text),
    sdp_docx = collapse_matches(sdp_text)
  )) %>%
    tokens() %>%
    dfm()
  return(dashed_dfm)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment