Natural Language Feature Extraction | Bag of Words using NLTK (Python)
import nltk
import string
from collections import defaultdict

# Sample Gutenberg corpus loaded from nltk.corpus
corpus = " ".join(nltk.corpus.gutenberg.words('austen-emma.txt'))

# Tokenize the corpus into sentences
def sent_tokenize(corpus):
    return [sentence for sentence in nltk.sent_tokenize(corpus)]

# Yield each lowercased, stemmed word of a sentence, skipping punctuation
def tokenize(sentence):
    stem = nltk.stem.SnowballStemmer('english')
    sentence = sentence.lower()
    for word in nltk.word_tokenize(sentence):
        if word not in string.punctuation:
            yield stem.stem(word)

# Vectorize a single sentence into a bag-of-words frequency dict
def vectorize(sentence):
    features = defaultdict(int)  # default value of zero for each unseen word / token
    for token in tokenize(sentence):
        features[token] += 1
    return features

# Apply vectorize to every sentence of the corpus
vectors = map(vectorize, sent_tokenize(corpus))
# vectors yields one feature dict per sentence, mapping each token to its frequency
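Note that map is lazy in Python 3, so nothing is computed until vectors is consumed. Below is a minimal usage sketch, not part of the original gist: the vocabulary variable and the inspection steps are illustrative, and it assumes the snippet above has run and that NLTK's 'punkt' tokenizer models and the 'gutenberg' corpus have been downloaded (e.g. via nltk.download('punkt')).

# Usage sketch (assumes the code above has already been executed)
vectors = list(vectors)  # materialise the lazy map object into a list of dicts
vocabulary = sorted({token for vector in vectors for token in vector})
print(len(vectors), "sentence vectors,", len(vocabulary), "distinct tokens")
# Show the five most frequent tokens of the first sentence vector
print(sorted(vectors[0].items(), key=lambda item: item[1], reverse=True)[:5])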