Skip to content

Instantly share code, notes, and snippets.

@jonathanoheix
Created December 18, 2018 09:49
Show Gist options
  • Save jonathanoheix/69a6bb1ad5dee29839f8849085106521 to your computer and use it in GitHub Desktop.
Save jonathanoheix/69a6bb1ad5dee29839f8849085106521 to your computer and use it in GitHub Desktop.
# Return the WordNet part-of-speech constant corresponding to a POS tag.
from nltk.corpus import wordnet
def get_wordnet_pos(pos_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is classified by its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any tag that starts
    with none of these letters is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Fallback for every unrecognised tag.
    return wordnet.NOUN
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
def clean_text(text):
    """Normalise a raw text string into a space-joined string of lemmas.

    Processing steps, in order:

    1. lower-case the text and split it on whitespace;
    2. strip leading/trailing punctuation from every token;
    3. drop empty tokens, tokens containing a digit, and English stop words;
    4. POS-tag the remaining tokens and lemmatize each one with WordNet,
       using the tag to choose the part of speech;
    5. drop lemmas of a single character and join the rest with spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as a single string (empty if nothing survives).
    """
    # split() without an argument splits on any run of whitespace, so
    # tokens separated by tabs or newlines are no longer glued together
    # (split(" ") only recognised the literal space character).
    raw_tokens = text.lower().split()

    # A set gives O(1) membership tests; the list returned by NLTK would
    # be scanned once for every token.
    stop_words = set(stopwords.words('english'))
    # One lemmatizer for the whole text instead of one per token.
    lemmatizer = WordNetLemmatizer()

    stripped = (word.strip(string.punctuation) for word in raw_tokens)
    tokens = [
        token
        for token in stripped
        if token
        and not any(c.isdigit() for c in token)
        and token not in stop_words
    ]

    lemmas = (
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    )
    # Single-letter lemmas are dropped.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)
# Clean every entry of the "review" column and store the result under
# "review_clean". clean_text is passed directly; wrapping it in a lambda
# adds nothing.
# NOTE(review): reviews_df is not defined in this snippet - presumably a
# pandas DataFrame created earlier; confirm against the caller.
reviews_df["review_clean"] = reviews_df["review"].apply(clean_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment