Skip to content

Instantly share code, notes, and snippets.

@Davisy
Created June 25, 2021 08:05
Show Gist options
  • Save Davisy/5f6f73ad113389b82eaf35f2ed1edb84 to your computer and use it in GitHub Desktop.
Save Davisy/5f6f73ad113389b82eaf35f2ed1edb84 to your computer and use it in GitHub Desktop.
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
# Clean the text, with the option to remove stop_words and to lemmatize word
# Clean the text
text = re.sub(r"[^A-Za-z0-9]", " ", text)
text = re.sub(r"\'s", " ", text)
text = re.sub(r"http\S+", " link ", text)
text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text) # remove numbers
# Remove punctuation from text
text = "".join([c for c in text if c not in punctuation])
# Optionally, remove stop words
if remove_stop_words:
# load stopwords
stop_words = stopwords.words("english")
text = text.split()
text = [w for w in text if not w in stop_words]
text = " ".join(text)
# Optionally, shorten words to their stems
if lemmatize_words:
text = text.split()
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
text = " ".join(lemmatized_words)
# Return a list of words
return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment