jonathanoheix · December 18, 2018 09:49
diff --git a/nlp4.py b/nlp4.py
 # return the wordnet object value corresponding to the POS tag
 from nltk.corpus import wordnet

 def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The first letter of the tag selects the constant: 'J' gives
    wordnet.ADJ, 'V' gives wordnet.VERB, 'N' gives wordnet.NOUN and
    'R' gives wordnet.ADV. Any other tag falls back to wordnet.NOUN.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # unrecognized tags are treated as nouns
    return wordnet.NOUN
    
 import string
 from nltk import pos_tag
 from nltk.corpus import stopwords
 from nltk.tokenize import WhitespaceTokenizer
 from nltk.stem import WordNetLemmatizer

 def clean_text(text):
    """Normalize a raw text string into a space-joined string of lemmas.

    Steps, in order: lower-case the text, split it on whitespace, strip
    leading/trailing punctuation from each token, drop tokens containing
    a digit, drop English stop words and empty tokens, POS-tag and
    lemmatize what remains, and finally drop one-character lemmas.

    Args:
        text: the raw text to clean (must support ``str.lower``).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # tokenize text and remove punctuation; split() with no argument
    # breaks on any whitespace run (tabs and newlines included), so words
    # separated by a line break are no longer glued into a single token
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words; a set gives constant-time membership tests
    stop = set(stopwords.words('english'))
    # the truthiness check also drops tokens emptied by punctuation stripping
    tokens = [word for word in tokens if word and word not in stop]
    # pos tag text
    tagged = pos_tag(tokens)
    # lemmatize text, reusing one lemmatizer for every token
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
    # remove words with only one letter, then join all
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

 # build the cleaned-text column by running clean_text over every review
 reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)
	# return the wordnet object value corresponding to the POS tag
	from nltk.corpus import wordnet

	def get_wordnet_pos(pos_tag):
		"""Return the WordNet POS constant corresponding to a POS tag.

		Tags starting with 'J' map to wordnet.ADJ, 'V' to wordnet.VERB,
		'N' to wordnet.NOUN and 'R' to wordnet.ADV. Every other tag
		falls back to wordnet.NOUN.
		"""
		if pos_tag.startswith('J'):
			return wordnet.ADJ
		if pos_tag.startswith('V'):
			return wordnet.VERB
		if pos_tag.startswith('N'):
			return wordnet.NOUN
		if pos_tag.startswith('R'):
			return wordnet.ADV
		# unrecognized tags are treated as nouns
		return wordnet.NOUN

	import string
	from nltk import pos_tag
	from nltk.corpus import stopwords
	from nltk.tokenize import WhitespaceTokenizer
	from nltk.stem import WordNetLemmatizer

	def clean_text(text):
		"""Normalize a raw text string into a space-joined string of lemmas.

		Steps, in order: lower-case the text, split it on whitespace, strip
		leading/trailing punctuation from each token, drop tokens containing
		a digit, drop English stop words and empty tokens, POS-tag and
		lemmatize what remains, and finally drop one-character lemmas.

		Args:
			text: the raw text to clean (must support ``str.lower``).

		Returns:
			The surviving lemmas joined by single spaces.
		"""
		# tokenize text and remove punctuation; split() with no argument
		# breaks on any whitespace run (tabs and newlines included), so words
		# separated by a line break are no longer glued into a single token
		tokens = [word.strip(string.punctuation) for word in text.lower().split()]
		# remove words that contain numbers
		tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
		# remove stop words; a set gives constant-time membership tests
		stop = set(stopwords.words('english'))
		# the truthiness check also drops tokens emptied by punctuation stripping
		tokens = [word for word in tokens if word and word not in stop]
		# pos tag text
		tagged = pos_tag(tokens)
		# lemmatize text, reusing one lemmatizer for every token
		lemmatizer = WordNetLemmatizer()
		lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
		# remove words with only one letter, then join all
		return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

	# build the cleaned-text column by running clean_text over every review
	reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)