thibaut-d · August 24, 2021 21:22
diff --git a/clean_text_corpus.R b/clean_text_corpus.R
 library(tidyverse)
 library(textshape)
 library(lexicon)
 library(textclean)
 library(hunspell)
 library(qdapRegex)

 #' Detect and correct misspelled words in each element of a character vector
 #'
 #' Each element is spell-checked with `hunspell()`; every word it flags is
 #' replaced by the first entry returned by `hunspell_suggest()`. Flagged words
 #' without any suggestion are kept as they are. Elements that are `NA` or not
 #' a single string are returned untouched.
 #' @param x character vector (a list of single strings is also accepted).
 #' @return `x`, same length and names, with flagged words replaced.
 #' @examples
 #' replace_misspells('I vrite anglish verry weell')
 #' replace_misspells('Romeo, Romeo! Wherefore art thou Romeo?')
 #'
 replace_misspells = function(x){
   for (idx in seq_along(x)){
     txt = x[[idx]]
     # Check the element itself (not its index): skip anything that is not a
     # single, non-missing string
     if (!is.character(txt) || length(txt) != 1 || is.na(txt)) next

     # A word flagged several times only needs to be replaced once
     bad = unique(hunspell(txt)[[1]])
     if (length(bad) == 0) next
     suggestions = hunspell_suggest(bad)

     for (i in seq_along(bad)){
       # No suggestion available for this word: leave it in place
       if (length(suggestions[[i]]) == 0) next
       # fixed = TRUE so the flagged word is matched literally, not as a regex
       txt = gsub(bad[i], suggestions[[i]][1], txt, fixed = TRUE)
     }
     x[[idx]] = txt
   }
   return(x)
 }

 #' Normalise raw text so that it is better suited to NLP processing
 #'
 #' Pipes the input through a fixed sequence of replacement helpers (from the
 #' packages loaded at the top of the file) and finishes with spell
 #' correction. The order of the steps is significant and must be preserved.
 #' @param x list of strings
 #' @return cleaned strings
 #' @examples
 #' clean_corpus(df$text)
 #' clean_corpus(corpus)
 #'
 clean_corpus = function(x){
   x %>%
     # Collapse redundant white space and line breaks such as \n
     replace_white() %>%
     # Replace or drop non-ASCII characters
     replace_non_ascii() %>%
     # Expand contractions, e.g. "you're" becomes "you are"
     replace_contraction() %>%
     # Shorten elongated words, e.g. "heyyyyy"
     replace_word_elongation() %>%
     # Turn emoji into plain text
     replace_emoji() %>%
     # Likewise for emoticons
     replace_emoticon() %>%
     # Get rid of any HTML left in the text
     replace_html() %>%
     # Normalise the markers of incomplete sentences
     replace_incomplete('.') %>%
     # Swap internet slang for standard words
     replace_internet_slang() %>%
     # Normalise spacing
     replace_kern() %>%
     # Substitute a placeholder word for amounts of money
     replace_money(replacement = 'money') %>%
     # Substitute a placeholder word for names
     replace_names(replacement = 'name') %>%
     # Substitute a placeholder word for dates
     replace_date(replacement = 'date') %>%
     # Substitute a placeholder word for times
     replace_time(replacement = 'time') %>%
     # Spell out ordinals, e.g. 1st becomes first
     replace_ordinal() %>%
     # Map ratings such as "five stars" to more common adjectives
     replace_rating() %>%
     # Substitute a placeholder word for the remaining numbers
     rm_number(replacement = "number") %>%
     # Expand symbols used as abbreviations, e.g. @ becomes at
     replace_symbol() %>%
     # Strip leftover characters, keeping basic punctuation
     strip(char.keep = c("?","!", ".",",",";",":","'")) %>%
     # Correct misspelled words (disable this step if the data is too large)
     replace_misspells()
 }
	library(tidyverse)
	library(textshape)
	library(lexicon)
	library(textclean)
	library(hunspell)
	library(qdapRegex)

	#' Detect and correct misspelled words in each element of a character vector
	#'
	#' Each element is spell-checked with `hunspell()`; every word it flags is
	#' replaced by the first entry returned by `hunspell_suggest()`. Flagged words
	#' without any suggestion are kept as they are. Elements that are `NA` or not
	#' a single string are returned untouched.
	#' @param x character vector (a list of single strings is also accepted).
	#' @return `x`, same length and names, with flagged words replaced.
	#' @examples
	#' replace_misspells('I vrite anglish verry weell')
	#' replace_misspells('Romeo, Romeo! Wherefore art thou Romeo?')
	#'
	replace_misspells = function(x){
	  for (idx in seq_along(x)){
	    txt = x[[idx]]
	    # Check the element itself (not its index): skip anything that is not a
	    # single, non-missing string
	    if (!is.character(txt) || length(txt) != 1 || is.na(txt)) next

	    # A word flagged several times only needs to be replaced once
	    bad = unique(hunspell(txt)[[1]])
	    if (length(bad) == 0) next
	    suggestions = hunspell_suggest(bad)

	    for (i in seq_along(bad)){
	      # No suggestion available for this word: leave it in place
	      if (length(suggestions[[i]]) == 0) next
	      # fixed = TRUE so the flagged word is matched literally, not as a regex
	      txt = gsub(bad[i], suggestions[[i]][1], txt, fixed = TRUE)
	    }
	    x[[idx]] = txt
	  }
	  return(x)
	}

	#' Normalise raw text so that it is better suited to NLP processing
	#'
	#' Pipes the input through a fixed sequence of replacement helpers (from the
	#' packages loaded at the top of the file) and finishes with spell
	#' correction. The order of the steps is significant and must be preserved.
	#' @param x list of strings
	#' @return cleaned strings
	#' @examples
	#' clean_corpus(df$text)
	#' clean_corpus(corpus)
	#'
	clean_corpus = function(x){
	  x %>%
	    # Collapse redundant white space and line breaks such as \n
	    replace_white() %>%
	    # Replace or drop non-ASCII characters
	    replace_non_ascii() %>%
	    # Expand contractions, e.g. "you're" becomes "you are"
	    replace_contraction() %>%
	    # Shorten elongated words, e.g. "heyyyyy"
	    replace_word_elongation() %>%
	    # Turn emoji into plain text
	    replace_emoji() %>%
	    # Likewise for emoticons
	    replace_emoticon() %>%
	    # Get rid of any HTML left in the text
	    replace_html() %>%
	    # Normalise the markers of incomplete sentences
	    replace_incomplete('.') %>%
	    # Swap internet slang for standard words
	    replace_internet_slang() %>%
	    # Normalise spacing
	    replace_kern() %>%
	    # Substitute a placeholder word for amounts of money
	    replace_money(replacement = 'money') %>%
	    # Substitute a placeholder word for names
	    replace_names(replacement = 'name') %>%
	    # Substitute a placeholder word for dates
	    replace_date(replacement = 'date') %>%
	    # Substitute a placeholder word for times
	    replace_time(replacement = 'time') %>%
	    # Spell out ordinals, e.g. 1st becomes first
	    replace_ordinal() %>%
	    # Map ratings such as "five stars" to more common adjectives
	    replace_rating() %>%
	    # Substitute a placeholder word for the remaining numbers
	    rm_number(replacement = "number") %>%
	    # Expand symbols used as abbreviations, e.g. @ becomes at
	    replace_symbol() %>%
	    # Strip leftover characters, keeping basic punctuation
	    strip(char.keep = c("?","!", ".",",",";",":","'")) %>%
	    # Correct misspelled words (disable this step if the data is too large)
	    replace_misspells()
	}
No results found