EricRohlfs · September 19, 2021 17:00
diff --git a/rlang_text_corpus_helpers b/rlang_text_corpus_helpers


 # Pivot a document-feature matrix into a long data.table with one row per
 # (document, column) pair.
 #
 # dfm: object accepted by convert(to = "data.frame")
 #      (presumably a quanteda dfm -- confirm against callers).
 #
 # Returns the melted table with columns, in this order:
 #   docid             - row id taken from the row names of the converted frame
 #   document          - label of the document the row belongs to
 #   word              - name of the melted column
 #   occurance_of_word - melted cell value
 pivotDFM <- function(dfm) {
   # NOTE(review): only "docid" appears to be used as the row-name column name
   # (it is the column referenced below); the second element of keep.rownames
   # looks unused -- confirm against the data.table documentation.
   dfm_dt <- dfm %>%
     convert(to = "data.frame") %>%
     setDT(keep.rownames = c("docid", "doc_id")) # convert to data.table

   # The first column that is not the row id holds the document labels. It is
   # melted first, so its values are the leading entries of `value`; the
   # previous code relied on that by reading value[1] and value[2], which only
   # worked for two documents. Looking the label up per row id gives the same
   # result for one or two documents and correct labels for any larger number.
   row_ids <- dfm_dt[["docid"]]
   label_column <- setdiff(names(dfm_dt), "docid")[1L]
   doc_labels <- as.character(dfm_dt[[label_column]])

   pivoted <- melt(dfm_dt, id.vars = "docid") %>%
     mutate(document = doc_labels[match(docid, row_ids)]) %>%
     setcolorder(c("docid", "document", "variable", "value")) %>%
     setnames(
       c("docid", "document", "variable", "value"),
       c("docid", "document", "word", "occurance_of_word")
     )
   # TODO: the rows produced by melting the document-label column (one per
   # document) are still part of the result. They are old column headers, not
   # words, and are confusing for users of this table; drop them after the
   # pivot.

   return(pivoted)
 }

 # Drop every element of a character vector that matches any of the given
 # search patterns, reporting what is removed for each pattern.
 #
 # char_vector:    character vector to filter.
 # search_pattern: vector of patterns, e.g. c('@', 'http'); each one is handed
 #                 to str_detect() as its `pattern` argument.
 #
 # Returns the elements of char_vector that survived all patterns. Patterns are
 # applied one after another, so each pattern only sees what the previous ones
 # left behind.
 remove_from_char_vector <- function(char_vector, search_pattern) {
   remaining <- char_vector
   for (current_pattern in search_pattern) {
     print(paste0("removing the following entries from the character list using search pattern :", current_pattern))
     # Evaluate the match mask once and reuse it for reporting and filtering.
     is_match <- str_detect(remaining, pattern = current_pattern)
     print(remaining[is_match])
     remaining <- remaining[!is_match]
   }
   remaining
 }


 # Build a document-feature matrix from the dash-containing words found in two
 # texts.
 #
 # sdp_text, pws_text: character input searched with str_extract_all().
 #
 # Returns the result of corpus() %>% tokens() %>% dfm() over two documents
 # named "pws_docx" and "sdp_docx", each holding the matches from the
 # corresponding text.
 dashed_words_dfm <- function(sdp_text, pws_text) {
   # Matches a run of letters, digits, apostrophes and dashes that starts
   # inside a whitespace-free stretch containing a dash.
   # NOTE(review): because the lookahead only requires a dash somewhere later
   # in the same whitespace-free stretch, "abc,def-ghi" also yields "abc";
   # confirm whether that is intended.
   dashed_words_regex <- "(?=\\S*[-])([a-zA-Z0-9'-]+)"

   # str_extract_all() returns a list. Calling toString() on that list
   # deparses any element with several matches into text such as
   # c("a-b", "c-d") (or character(0) when nothing matched), which then adds
   # junk tokens to the dfm. Flatten the matches and join them with spaces.
   collapse_matches <- function(text) {
     hits <- unlist(str_extract_all(text, dashed_words_regex), use.names = FALSE)
     hits <- hits[!is.na(hits)]
     paste(hits, collapse = " ")
   }

   dashed_words <- c(
     pws_docx = collapse_matches(pws_text),
     sdp_docx = collapse_matches(sdp_text)
   )

   corpus(dashed_words) %>%
     tokens() %>%
     dfm()
 }


	# Pivot a document-feature matrix into a long data.table with one row per
	# (document, column) pair.
	#
	# dfm: object accepted by convert(to = "data.frame")
	#      (presumably a quanteda dfm -- confirm against callers).
	#
	# Returns the melted table with columns, in this order:
	#   docid             - row id taken from the row names of the converted frame
	#   document          - label of the document the row belongs to
	#   word              - name of the melted column
	#   occurance_of_word - melted cell value
	pivotDFM <- function(dfm) {
	  # NOTE(review): only "docid" appears to be used as the row-name column name
	  # (it is the column referenced below); the second element of keep.rownames
	  # looks unused -- confirm against the data.table documentation.
	  dfm_dt <- dfm %>%
	    convert(to = "data.frame") %>%
	    setDT(keep.rownames = c("docid", "doc_id")) # convert to data.table

	  # The first column that is not the row id holds the document labels. It is
	  # melted first, so its values are the leading entries of `value`; the
	  # previous code relied on that by reading value[1] and value[2], which only
	  # worked for two documents. Looking the label up per row id gives the same
	  # result for one or two documents and correct labels for any larger number.
	  row_ids <- dfm_dt[["docid"]]
	  label_column <- setdiff(names(dfm_dt), "docid")[1L]
	  doc_labels <- as.character(dfm_dt[[label_column]])

	  pivoted <- melt(dfm_dt, id.vars = "docid") %>%
	    mutate(document = doc_labels[match(docid, row_ids)]) %>%
	    setcolorder(c("docid", "document", "variable", "value")) %>%
	    setnames(
	      c("docid", "document", "variable", "value"),
	      c("docid", "document", "word", "occurance_of_word")
	    )
	  # TODO: the rows produced by melting the document-label column (one per
	  # document) are still part of the result. They are old column headers, not
	  # words, and are confusing for users of this table; drop them after the
	  # pivot.

	  return(pivoted)
	}

	# Drop every element of a character vector that matches any of the given
	# search patterns, reporting what is removed for each pattern.
	#
	# char_vector:    character vector to filter.
	# search_pattern: vector of patterns, e.g. c('@', 'http'); each one is handed
	#                 to str_detect() as its `pattern` argument.
	#
	# Returns the elements of char_vector that survived all patterns. Patterns are
	# applied one after another, so each pattern only sees what the previous ones
	# left behind.
	remove_from_char_vector <- function(char_vector, search_pattern) {
	  remaining <- char_vector
	  for (current_pattern in search_pattern) {
	    print(paste0("removing the following entries from the character list using search pattern :", current_pattern))
	    # Evaluate the match mask once and reuse it for reporting and filtering.
	    is_match <- str_detect(remaining, pattern = current_pattern)
	    print(remaining[is_match])
	    remaining <- remaining[!is_match]
	  }
	  remaining
	}


	# Build a document-feature matrix from the dash-containing words found in two
	# texts.
	#
	# sdp_text, pws_text: character input searched with str_extract_all().
	#
	# Returns the result of corpus() %>% tokens() %>% dfm() over two documents
	# named "pws_docx" and "sdp_docx", each holding the matches from the
	# corresponding text.
	dashed_words_dfm <- function(sdp_text, pws_text) {
	  # Matches a run of letters, digits, apostrophes and dashes that starts
	  # inside a whitespace-free stretch containing a dash.
	  # NOTE(review): because the lookahead only requires a dash somewhere later
	  # in the same whitespace-free stretch, "abc,def-ghi" also yields "abc";
	  # confirm whether that is intended.
	  dashed_words_regex <- "(?=\\S*[-])([a-zA-Z0-9'-]+)"

	  # str_extract_all() returns a list. Calling toString() on that list
	  # deparses any element with several matches into text such as
	  # c("a-b", "c-d") (or character(0) when nothing matched), which then adds
	  # junk tokens to the dfm. Flatten the matches and join them with spaces.
	  collapse_matches <- function(text) {
	    hits <- unlist(str_extract_all(text, dashed_words_regex), use.names = FALSE)
	    hits <- hits[!is.na(hits)]
	    paste(hits, collapse = " ")
	  }

	  dashed_words <- c(
	    pws_docx = collapse_matches(pws_text),
	    sdp_docx = collapse_matches(sdp_text)
	  )

	  corpus(dashed_words) %>%
	    tokens() %>%
	    dfm()
	}