Created
October 17, 2021 11:45
-
-
Save vinimonteiro/2c6e1efabf70ddb3312c404f801f7c81 to your computer and use it in GitHub Desktop.
Stop word removal using NLTK
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Demo script: tokenize a sample sentence and drop English stop words.
# NOTE(review): presumably the NLTK 'stopwords' corpus and the tokenizer
# data must already be downloaded (e.g. via nltk.download) -- confirm.

# Sample text to be tokenized and filtered.
sentence = """Clairson International Corp. said it expects to report a
net loss for its second quarter ended March 26 and doesn't expect to meet analysts' profit
estimates of $3.0 to $4 million, or
1,276 cents a share to 1,279 cents a share, for its year ending Sept. 24."""

# A set gives O(1) membership tests while filtering.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(sentence)

# Lower-case each token for the membership test so that capitalized
# tokens (e.g. at the start of a sentence) are matched against the
# stop-word set as well; the kept tokens retain their original casing.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

print(word_tokens)
print(filtered_sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment