Last active
May 6, 2020 02:01
-
-
Save DanielDaCosta/1a3d2b14d2d76afe320f0ea5b922da0a to your computer and use it in GitHub Desktop.
Text preprocessing for a multi-label classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem.wordnet import WordNetLemmatizer | |
def clean_text(text):
    """Normalize a raw message for multi-label text classification.

    Pipeline: lowercase -> strip @mentions -> replace URLs with the
    token 'urlplaceholder' -> keep hashtag words (drop only '#') ->
    expand common English contractions -> space out operator symbols ->
    remove remaining punctuation -> drop English stopwords -> lemmatize.

    Args:
        text (str): raw input message.

    Returns:
        str: cleaned tokens joined by single spaces.
    """
    text = text.lower()
    # '@' mentions: the handle itself adds no value to the
    # classification model, so drop it entirely.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Standard http/https URLs.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, 'urlplaceholder', text)
    # Many URLs in this corpus are written like "http bit.ly".
    url_regex_2 = r'http [a-zA-Z]+\.[a-zA-Z]+'
    text = re.sub(url_regex_2, 'urlplaceholder', text)
    # Another observed format: "http : //t.co/ihW64e8Z".
    url_regex_3 = r'http \: //[a-zA-Z]\.(co|com|pt|ly)/[A-Za-z0-9_]+'
    text = re.sub(url_regex_3, 'urlplaceholder', text)
    # Hashtags can carry useful information; remove only the '#' symbol.
    text = re.sub('#', ' ', text)
    # Contractions.
    text = re.sub(r"what's", 'what is ', text)
    text = re.sub(r"can't", 'cannot', text)
    text = re.sub(r"\'s", ' ', text)
    text = re.sub(r"\'ve", ' have ', text)
    text = re.sub(r"n't", ' not ', text)
    # BUG FIX: the original pattern r"im" matched the substring inside
    # any word ("time" -> "ti am e"); anchor with word boundaries so
    # only the standalone token "im" is expanded.
    text = re.sub(r"\bim\b", 'i am ', text)
    text = re.sub(r"i'm", 'i am ', text)
    text = re.sub(r"\'re", ' are ', text)
    text = re.sub(r"\'d", ' would ', text)
    text = re.sub(r"\'ll", ' will ', text)
    # Operators and special words: surround with spaces so they
    # tokenize separately (or vanish at punctuation removal below).
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    # Corpus-specific normalizations.
    # NOTE(review): these match substrings inside longer words too
    # ("msgs" -> "messages"); presumably intentional for this corpus.
    text = re.sub('foof', 'food', text)
    text = re.sub('msg', 'message', text)
    # BUG FIX: the original replacement 'you' (no spaces) fused the
    # neighboring words ("are u ok" -> "areyouok"); keep the spaces.
    text = re.sub(' u ', ' you ', text)
    # Punctuation removal: keep only alphanumerics.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = text.split()
    # Requires the NLTK 'stopwords' corpus to be downloaded.
    stop_words = stopwords.words("english")
    text = [tok for tok in text if tok not in stop_words]
    # Requires the NLTK 'wordnet' corpus to be downloaded.
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(w) for w in text]
    return ' '.join(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment