Skip to content

Instantly share code, notes, and snippets.

@Venkatstatistics
Created September 20, 2019 14:57
Show Gist options
  • Save Venkatstatistics/7c11dde8fd5a4451c54b0f25e5c45b5c to your computer and use it in GitHub Desktop.
Save Venkatstatistics/7c11dde8fd5a4451c54b0f25e5c45b5c to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Lowercasing: map every entry of `texts` to its lower-case form.
texts = ["JOHN", "keLLY", "ArJUN", "SITA"]
lower_words = list(map(str.lower, texts))
# Bare expression: only displays the list in an interactive session
# (REPL / notebook cell); it has no effect when run as a script.
lower_words
# Stemming: reduce each word with the Porter stemmer and tabulate the result.
import nltk
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the WordNet data before creating the lemmatizer; the lemmatizer
# itself is only used further down in the script.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

words = ["call", "called", "calling"]
stemmed_words = list(map(porter_stemmer.stem, words))

# Two-column table: the input word next to its stem.
stemdf = pd.DataFrame(dict(original_word=words, stemmed_word=stemmed_words))
# Bare expression: only displays the frame in an interactive session.
stemdf
# Difference between stemming and lemmatization: run the same word through
# the Porter stemmer and through the WordNet lemmatizer, then print both.
words = ["geese"]

stemmed_words = list(map(porter_stemmer.stem, words))
print(stemmed_words)

# pos='n' asks the lemmatizer to treat each word as a noun.
lemmatized_words = [lemmatizer.lemmatize(word, pos='n') for word in words]
print(lemmatized_words)
# Tokenization: split a sentence into word tokens and print them.
from nltk.tokenize import word_tokenize

text = "let us learn NLP"
tokens = word_tokenize(text)
print(tokens)
# Stop-word removal: tokenize a sentence and drop every token that appears
# in NLTK's English stop-word list.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

# NOTE: stopwords.words() and word_tokenize() read NLTK data packages (the
# 'stopwords' corpus and a Punkt tokenizer model). No download for them is
# visible in this script, so they must already be installed, e.g. through
# nltk.download().
# A set is used so each membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Keep the tokens that are not stop words. Tokens are compared as-is (no
# case folding), exactly as in the original filter. This single
# comprehension replaces the former duplicate for/if/append loop, which had
# lost its indentation and could not be parsed.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment