Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save iamutkarshtiwari/b3cf13a77ef84e50fcbcaed2a883701e to your computer and use it in GitHub Desktop.
Save iamutkarshtiwari/b3cf13a77ef84e50fcbcaed2a883701e to your computer and use it in GitHub Desktop.
# Minimal bag-of-words walkthrough: tokenize one sentence, drop English stop
# words, Porter-stem what is left, then encode the result with a gensim
# Dictionary.
import scipy
import numpy
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models

doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."

p_stemmer = PorterStemmer()
en_stop = get_stop_words('en')
tokenizer = RegexpTokenizer(r'\w+')

# Lower-case before tokenizing so "Brocolli" and "brocolli" become the same
# term; the \w+ pattern keeps only runs of word characters, which drops the
# punctuation attached to "eat." / "brocolli," / "mother.".
raw = doc_a.lower()
tokens = tokenizer.tokenize(raw)

stopped_tokens = [token for token in tokens if token not in en_stop]
print(stopped_tokens)

# gensim works on a collection of documents, each one a list of tokens, so
# the single stemmed document is wrapped in an outer list.
texts = [[p_stemmer.stem(token) for token in stopped_tokens]]

# The dictionary has to be built from the very same tokens that are later
# handed to doc2bow(): doc2bow() silently skips any token without an id, so a
# vocabulary built from differently-cleaned text loses words.
dictionary = corpora.Dictionary(texts)

# One bag-of-words vector (list of (token_id, count) pairs) per document.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment