Created
June 25, 2021 08:05
-
-
Save Davisy/5f6f73ad113389b82eaf35f2ed1edb84 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    """Normalize raw text into one space-separated string of word tokens.

    The cleaning steps run in this order:

    1. Every run of non-whitespace characters starting at ``http`` is
       replaced by the token ``link``.
    2. Possessive ``'s`` suffixes are dropped.
    3. Standalone integers and decimals are removed; digits glued to
       letters (``abc123``) are kept.
    4. Every remaining character that is not an ASCII letter or digit
       becomes a separator.
    5. Optionally, words found in ``stopwords.words("english")`` are removed.
    6. Optionally, each word is passed through
       ``WordNetLemmatizer().lemmatize``.

    Args:
        text: The string to clean.
        remove_stop_words: If true, drop English stop words.
        lemmatize_words: If true, lemmatize every remaining word.

    Returns:
        The cleaned words joined by single spaces (a ``str``, not a list).
        Whitespace is normalized regardless of which optional steps run.
    """
    # URLs and possessives must be handled while ":", "/" and "'" are still
    # present; stripping punctuation first makes both patterns unmatchable.
    text = re.sub(r"http\S+", " link ", text)
    # The trailing \b keeps a quoted word such as 'sure' from losing its "s".
    text = re.sub(r"'s\b", " ", text)
    # Remove numbers before "." is stripped so that "3.5" goes as one unit.
    # The lookarounds keep digits attached to letters and also catch a
    # number at the very end of the text.
    text = re.sub(r"(?<![A-Za-z0-9])\d+(?:\.\d+)?(?![A-Za-z0-9])", " ", text)
    # Everything left that is not an ASCII letter or digit, punctuation
    # included, becomes a separator.
    text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # Tokenize once; every later step works on the word list.
    words = text.split()

    if remove_stop_words:
        # A set gives O(1) membership tests instead of scanning a list for
        # every token.
        # NOTE(review): the comparison is case-sensitive; if the stop-word
        # list is all lowercase, capitalized words such as "The" are kept.
        # Confirm this is intended.
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]

    if lemmatize_words:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment