Skip to content

Instantly share code, notes, and snippets.

@DanielDaCosta
Last active May 6, 2020 02:01
Show Gist options
  • Save DanielDaCosta/1a3d2b14d2d76afe320f0ea5b922da0a to your computer and use it in GitHub Desktop.
Save DanielDaCosta/1a3d2b14d2d76afe320f0ea5b922da0a to your computer and use it in GitHub Desktop.
Text Preprocessing of MultiLabel Classifier
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
def clean_text(text):
    """Normalize a raw message into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, drop ``@mentions``, replace URLs with the
    token ``urlplaceholder``, strip ``#`` from hashtags, expand common English
    contractions, fix a few frequent misspellings/abbreviations, replace every
    non-alphanumeric character with a space, remove English stop words and
    lemmatize what remains.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces. The result is an empty
        string when every token is a stop word or punctuation.

    Note:
        Relies on NLTK's ``stopwords`` and WordNet data being available at
        call time.
    """
    text = text.lower()

    # '@' mentions. Even though a mention adds some information to the
    # message, it does not add value when building the classification model.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)

    # Well-formed URLs.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, 'urlplaceholder', text)
    # Many URLs are written as "http bit.ly"; handle that form as well.
    url_regex_2 = r'http [a-zA-Z]+\.[a-zA-Z]+'
    text = re.sub(url_regex_2, 'urlplaceholder', text)
    # Spaced-out form such as "http : //t.co/ihW64e8Z". The host part accepts
    # one or more letters so that hosts like "bit.ly" match too, not only
    # single-letter hosts like "t.co".
    url_regex_3 = r'http \: //[a-zA-Z]+\.(co|com|pt|ly)/[A-Za-z0-9_]+'
    text = re.sub(url_regex_3, 'urlplaceholder', text)

    # Hashtags can carry useful information, so only the '#' is removed.
    text = re.sub('#', ' ', text)

    # Contractions. Specific forms ("what's", "can't") must run before the
    # generic "'s" / "n't" rules below.
    text = re.sub(r"what's", 'what is ', text)
    text = re.sub(r"can't", 'cannot', text)
    text = re.sub(r"\'s", ' ', text)
    text = re.sub(r"\'ve", ' have ', text)
    text = re.sub(r"n't", ' not ', text)
    # Word boundaries keep this from rewriting "im" inside other words
    # (e.g. "time", "him", "image").
    text = re.sub(r"\bim\b", 'i am ', text)
    text = re.sub(r"i'm", 'i am ', text)
    text = re.sub(r"\'re", ' are ', text)
    text = re.sub(r"\'d", ' would ', text)
    text = re.sub(r"\'ll", ' will ', text)

    # Operators and special words. Symbols are padded with spaces so the
    # words on either side remain separate tokens.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub('foof', 'food', text)
    text = re.sub('msg', 'message', text)
    # Match a standalone "u" without consuming the surrounding spaces, so the
    # neighbouring words are not glued onto "you".
    text = re.sub(r'\bu\b', 'you', text)

    # Punctuation removal: anything that is not a letter or digit becomes a
    # space, then the text is split on whitespace.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    tokens = text.split()

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stopwords.words("english"))
    tokens = [tok for tok in tokens if tok not in stop_words]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment