Skip to content

Instantly share code, notes, and snippets.

@john-adeojo
Created March 21, 2023 14:25
Show Gist options
  • Save john-adeojo/a6b003a3ce307b179207a547a4bd62ce to your computer and use it in GitHub Desktop.
Save john-adeojo/a6b003a3ce307b179207a547a4bd62ce to your computer and use it in GitHub Desktop.
import re
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
class TextCleaner:
    """Clean raw text: strip URLs and optionally remove stop words.

    Parameters
    ----------
    stop_words : set[str] | None
        Custom stop-word set; falls back to NLTK's English list when None.
    stop_words_remove : bool
        When True, ``clean_text`` drops stop words from the text.
    """

    def __init__(self, stop_words=None, stop_words_remove=False):
        self.stop_words_remove = stop_words_remove
        if stop_words:
            self.stop_words = stop_words
        else:
            # Default to NLTK's English stop-word list.
            self.stop_words = set(stopwords.words('english'))
        # NOTE(review): lemmatizer is constructed but never used by
        # clean_text — presumably intended for a future step; confirm.
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return *text* with URLs removed and, if enabled, stop words dropped.

        Any input is coerced to ``str`` first, so non-string values
        (e.g. NaN from a DataFrame) are handled without raising.
        """
        text = str(text)
        # Remove URLs (anything from "http" up to the next whitespace).
        text = re.sub(r'http\S+', '', text)
        if self.stop_words_remove:
            # BUG FIX: the original referenced `tokens` before assignment
            # (NameError) and then discarded the filtered result. Tokenize,
            # filter case-insensitively, and rejoin with single spaces.
            tokens = word_tokenize(text)
            tokens = [token for token in tokens
                      if token.lower() not in self.stop_words]
            text = ' '.join(tokens)
        return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment