Last active
May 24, 2018 08:07
-
-
Save gauravbansal98/1f1425fd392d897a03d823e961e4f9d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def feature_vectors(input_file_name, lexicon, classification):
    """Build bag-of-words count vectors for the lines of a text file.

    Each processed line is lowercased, tokenized and lemmatized; the
    resulting vector has one slot per lexicon entry holding the number of
    times that entry occurs in the line.

    Args:
        input_file_name: Path of the text file to read, one sample per line.
        lexicon: Sequence of words; a word's position in it is its slot in
            the feature vector. If a word appears more than once, the first
            position is used (same as ``lexicon.index``).
        classification: Label attached to every sample from this file, e.g.
            ``[0, 1]`` for positive and ``[1, 0]`` for negative. The same
            object is stored in every returned pair (it is not copied).

    Returns:
        A list of ``[features, classification]`` pairs, where ``features``
        is a list of counts with ``len(lexicon)`` elements.

    Note:
        Relies on the module-level names ``hm_lines`` (maximum number of
        lines to process), ``word_tokenize``, ``lemmatizer`` and ``np``,
        which are defined outside this function.
    """
    # Map each lexicon word to its first position once, so every token
    # lookup is O(1) instead of two linear scans (`in` + `.index`).
    word_to_index = {}
    for position, entry in enumerate(lexicon):
        word_to_index.setdefault(entry, position)

    featureset = []
    with open(input_file_name, 'r') as f:
        contents = f.readlines()

    for line in contents[:hm_lines]:  # only the first hm_lines lines are used
        tokens = word_tokenize(line.lower())
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        features = np.zeros(len(lexicon))
        for token in tokens:
            index_value = word_to_index.get(token.lower())
            if index_value is not None:
                features[index_value] += 1  # count occurrences, not just presence

        # Store a plain list alongside the label for this sample.
        featureset.append([list(features), classification])

    return featureset
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment