import re

import nltk
import pandas as pd
from nltk import word_tokenize

nltk.download('wordnet')
nltk.download('punkt')

lemma = nltk.wordnet.WordNetLemmatizer()

# Load stop words from the txt file (one word per line)
with open("common_words.txt", "r") as text_file:
    common_words = text_file.read().split('\n')

# Use the top 500 words as stop words; a set gives fast membership checks
stop_words = set(common_words[:500])

# NLP cleaning function
def nlp_process(text):
    # Lower-case the text
    text = text.lower()
    # Replace all special characters, including numbers, with spaces
    text = re.sub('[^A-Za-z]+', ' ', text)
    # Tokenize the text into a list of words
    text = word_tokenize(text)
    # Remove stop words
    text = [word for word in text if word not in stop_words]
    # Lemmatize each tokenized word with WordNet
    text = [lemma.lemmatize(word) for word in text]
    # Join the cleaned words back into a single string
    text = ' '.join(text)
    return text
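
A minimal usage sketch follows, showing how the cleaning function might be applied to a pandas DataFrame column (the DataFrame df, its 'text' column, and the sample sentence are assumptions for illustration, not part of the original gist):

# Hypothetical DataFrame with a raw-text column (names are assumptions)
df = pd.DataFrame({'text': ["The 3 quick brown foxes were jumping over 2 lazy dogs!"]})

# Apply the cleaning function to every row of the raw-text column
df['clean_text'] = df['text'].apply(nlp_process)

print(df['clean_text'][0])
# e.g. "quick brown fox jumping lazy dog" -- the exact output depends on
# which words appear in the stop-word list loaded from common_words.txt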