-
-
Save amirinia/3eb2cc310d3fda3a0c850d60e40ee735 to your computer and use it in GitHub Desktop.
Code to generate bag of word vectors in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import statements
import numpy | |
import re | |
'''
Tokenize each of the sentences and collect the unique lowercase words in sorted order, for example:
Input : ["John likes to watch movies.", "Mary likes movies too"]
Output : ["john", "likes", "mary", "movies", "to", "too", "watch"]
'''
def tokenize(sentences):
    """Build the sorted vocabulary of unique words found in ``sentences``.

    Each sentence is passed through ``word_extraction``; the tokens it
    returns are pooled, de-duplicated and sorted.

    Args:
        sentences: An iterable of sentence strings.

    Returns:
        A sorted list containing every distinct token exactly once.
    """
    vocabulary = set()
    for text in sentences:
        vocabulary.update(word_extraction(text))
    return sorted(vocabulary)
def word_extraction(sentence):
    """Return the lowercase words of ``sentence`` with stop words removed.

    Every non-word character (punctuation, whitespace, ...) is treated as a
    separator. Stop words are matched after lowercasing, so "The" is dropped
    just like "the".

    Args:
        sentence: The text to split into words.

    Returns:
        A list of lowercase tokens in their original order; repeated words
        are kept.
    """
    ignore = {"a", "the", "is"}
    # Raw string keeps the regex escape intact and avoids an invalid-escape warning.
    words = re.sub(r"[^\w]", " ", sentence).split()
    # Lowercase first so capitalised stop words are filtered out too.
    lowered = (w.lower() for w in words)
    return [w for w in lowered if w not in ignore]
def generate_bow(allsentences):
    """Print the vocabulary and a bag-of-words count vector for each sentence.

    The vocabulary comes from ``tokenize(allsentences)``. For every sentence
    a float vector of ``len(vocab)`` zeros is created and the entry for each
    token returned by ``word_extraction`` is incremented, so position ``i``
    holds the number of occurrences of ``vocab[i]`` in that sentence.

    Args:
        allsentences: A sequence of sentence strings. It is iterated twice:
            once to build the vocabulary and once to build the vectors.

    Returns:
        None. The vocabulary and each (sentence, vector) pair are printed.
    """
    vocab = tokenize(allsentences)
    print("Word List for Document \n{0} \n".format(vocab))
    # Index the vocabulary once so each token is a dict lookup rather than
    # a linear scan over the whole word list.
    column_of = {word: i for i, word in enumerate(vocab)}
    for sentence in allsentences:
        bag_vector = numpy.zeros(len(vocab))
        for w in word_extraction(sentence):
            i = column_of.get(w)
            # Tokens missing from the vocabulary are ignored.
            if i is not None:
                bag_vector[i] += 1
        print("{0} \n{1}\n".format(sentence, bag_vector))
# Sample corpus used for the demonstration run below.
allsentences = [
    "Joe waited for the train",
    "The train was late",
    "Mary and Samantha took the bus",
    "I looked for Mary and Samantha at the bus station",
    "Mary and Samantha arrived at the bus station early but waited until noon for the bus",
]

# Only run the demo when executed as a script, so importing this module
# does not print anything.
if __name__ == "__main__":
    generate_bow(allsentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment