import re

import nltk
import pandas as pd
from nltk import word_tokenize

nltk.download('wordnet')
nltk.download('punkt')

lemma = nltk.wordnet.WordNetLemmatizer()

# Load stop words from the txt file (one word per line)
with open("common_words.txt", "r") as text_file:
    common_words = text_file.read().split('\n')

# Use the top 500 words as stop words; a set gives fast membership checks
stop_words = set(common_words[:500])

# NLP cleaning function
def nlp_process(text):
    # Lower-case the text
    text = text.lower()
    # Replace all special characters, including numbers, with spaces
    text = re.sub('[^A-Za-z]+', ' ', text)
    # Tokenize the text into a list of words
    text = word_tokenize(text)
    # Remove stop words
    text = [word for word in text if word not in stop_words]
    # Lemmatize each tokenized word with WordNet
    text = [lemma.lemmatize(word) for word in text]
    # Join the cleaned words back into a single string
    text = ' '.join(text)
    return text
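
A minimal usage sketch follows, showing how the cleaning function might be applied to a pandas DataFrame column (the DataFrame df, its 'text' column, and the sample sentence are assumptions for illustration, not part of the original gist):

# Hypothetical DataFrame with a raw-text column (names are assumptions)
df = pd.DataFrame({'text': ["The 3 quick brown foxes were jumping over 2 lazy dogs!"]})

# Apply the cleaning function to every row of the raw-text column
df['clean_text'] = df['text'].apply(nlp_process)

print(df['clean_text'][0])
# e.g. "quick brown fox jumping lazy dog" -- the exact output depends on
# which words appear in the stop-word list loaded from common_words.txt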