Skip to content

Instantly share code, notes, and snippets.

@kyoto-cheng
Last active June 25, 2021 14:55
Show Gist options
  • Save kyoto-cheng/39e3ac352e3822ffcfdea345265129e4 to your computer and use it in GitHub Desktop.
import re
import pandas as pd
import nltk
from nltk import word_tokenize

# Fetch the NLTK resources the pipeline needs (no-op if already cached):
# 'wordnet' for lemmatization, 'punkt' for word_tokenize.
nltk.download('wordnet')
nltk.download('punkt')

# Shared lemmatizer instance used by nlp_process below.
lemma = nltk.wordnet.WordNetLemmatizer()

# Load the newline-separated common-words file. Use a context manager so
# the file handle is always closed (the original left it open).
with open("common_words.txt", "r") as text_file:
    common_words = text_file.read().split('\n')

# Choose top 500 words as stop words
stop_words = common_words[:500]
# NLP cleaning function
def nlp_process(text):
# Case lowering
text = text.lower()
# Replacing all special characters including numbers
text = re.sub('[^A-Za-z]+',' ', text)
# Tokenizing text into word list
text = nltk.word_tokenize(text)
# Removing stop words
text = [word for word in text if word not in stop_words]
# WordNet lemmatizing each tokenized word
text = [lemma.lemmatize(word) for word in text]
# Joining words together
text = ' '.join(text)
return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment