Last active
May 6, 2020 02:01
-
-
Save DanielDaCosta/1a3d2b14d2d76afe320f0ea5b922da0a to your computer and use it in GitHub Desktop.
Text preprocessing for a multi-label classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem.wordnet import WordNetLemmatizer | |
def clean_text(text):
    """Normalize a raw message for multi-label text classification.

    Pipeline: lowercase -> strip @mentions -> replace URLs with the
    token 'urlplaceholder' -> keep hashtag words (drop only '#') ->
    expand common English contractions -> space out operator symbols ->
    remove remaining punctuation -> drop English stopwords -> lemmatize.

    Args:
        text (str): raw input message.

    Returns:
        str: cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # '@' mentions: the handle itself adds no value to the
    # classification model, so drop it entirely.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Standard http/https URLs.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, 'urlplaceholder', text)
    # Many URLs in this corpus are written like "http bit.ly".
    url_regex_2 = r'http [a-zA-Z]+\.[a-zA-Z]+'
    text = re.sub(url_regex_2, 'urlplaceholder', text)
    # Another observed format: "http : //t.co/ihW64e8Z".
    url_regex_3 = r'http \: //[a-zA-Z]\.(co|com|pt|ly)/[A-Za-z0-9_]+'
    text = re.sub(url_regex_3, 'urlplaceholder', text)
    # Hashtags can carry useful information; remove only the '#' symbol.
    text = re.sub('#', ' ', text)
    # Contractions.
    text = re.sub(r"what's", 'what is ', text)
    text = re.sub(r"can't", 'cannot', text)
    text = re.sub(r"\'s", ' ', text)
    text = re.sub(r"\'ve", ' have ', text)
    text = re.sub(r"n't", ' not ', text)
    # BUG FIX: the original pattern r"im" matched the substring inside
    # any word ("time" -> "ti am e"); anchor with word boundaries so
    # only the standalone token "im" is expanded.
    text = re.sub(r"\bim\b", 'i am ', text)
    text = re.sub(r"i'm", 'i am ', text)
    text = re.sub(r"\'re", ' are ', text)
    text = re.sub(r"\'d", ' would ', text)
    text = re.sub(r"\'ll", ' will ', text)
    # Operators and special words: surround with spaces so they
    # tokenize separately (or vanish at punctuation removal below).
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    # Corpus-specific normalizations.
    # NOTE(review): these match substrings inside longer words too
    # ("msgs" -> "messages"); presumably intentional for this corpus.
    text = re.sub('foof', 'food', text)
    text = re.sub('msg', 'message', text)
    # BUG FIX: the original replacement 'you' (no spaces) fused the
    # neighboring words ("are u ok" -> "areyouok"); keep the spaces.
    text = re.sub(' u ', ' you ', text)
    # Punctuation removal: keep only alphanumerics.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = text.split()
    # Requires the NLTK 'stopwords' corpus to be downloaded.
    stop_words = stopwords.words("english")
    text = [tok for tok in text if tok not in stop_words]
    # Requires the NLTK 'wordnet' corpus to be downloaded.
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(w) for w in text]
    return ' '.join(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment