gauravbansal98 · May 24, 2018 08:07
diff --git a/Create feature vector b/Create feature vector
 def feature_vectors(input_file_name,lexicon,classification):
   """Turn the first ``hm_lines`` lines of a text file into word-count vectors.

   Every processed line is lower-cased, split into tokens with
   ``word_tokenize`` and lemmatized with ``lemmatizer``. Each resulting token
   that occurs in ``lexicon`` increments the counter at that word's position
   in the line's feature vector. ``hm_lines``, ``word_tokenize`` and
   ``lemmatizer`` are names defined outside this function.

   Args:
     input_file_name: path of the text file to read; one feature vector is
       produced per line.
     lexicon: sequence of hashable words. Slot ``i`` of every feature vector
       counts occurrences of ``lexicon[i]``; if a word is repeated in the
       lexicon only its first position is ever incremented.
     classification: label stored next to every vector, e.g. [0, 1] for
       positive and [1, 0] for negative. The very same object is stored in
       every row (it is not copied).

   Returns:
     A list of ``[features, classification]`` pairs, where ``features`` is a
     list of ``len(lexicon)`` float counts.
   """
   # Map each word to the position of its FIRST occurrence (the value that
   # ``lexicon.index`` would return) once, so each token costs a single O(1)
   # lookup instead of two linear scans of the lexicon.
   word_to_index = {}
   for position, entry in enumerate(lexicon):
     word_to_index.setdefault(entry, position)

   vector_length = len(lexicon)
   featureset = []

   with open(input_file_name,'r') as f:
     contents = f.readlines()

   for line in contents[:hm_lines]:                                 #number of lines we need to process
     tokens = word_tokenize(line.lower())                           #lowercase the sentence, then split it into words
     lemmas = [lemmatizer.lemmatize(token) for token in tokens]
     features = np.zeros(vector_length)                             #one counter per lexicon entry
     for word in lemmas:
       index_value = word_to_index.get(word.lower())
       if index_value is not None:                                  #tokens outside the lexicon are ignored
         features[index_value] += 1

     # Each row is a two-element list: the feature vector first, its
     # classification second.
     featureset.append([list(features),classification])

   return featureset
	def feature_vectors(input_file_name,lexicon,classification):
	  """Turn the first ``hm_lines`` lines of a text file into word-count vectors.

	  Every processed line is lower-cased, split into tokens with
	  ``word_tokenize`` and lemmatized with ``lemmatizer``. Each resulting token
	  that occurs in ``lexicon`` increments the counter at that word's position
	  in the line's feature vector. ``hm_lines``, ``word_tokenize`` and
	  ``lemmatizer`` are names defined outside this function.

	  Args:
	    input_file_name: path of the text file to read; one feature vector is
	      produced per line.
	    lexicon: sequence of hashable words. Slot ``i`` of every feature vector
	      counts occurrences of ``lexicon[i]``; if a word is repeated in the
	      lexicon only its first position is ever incremented.
	    classification: label stored next to every vector, e.g. [0, 1] for
	      positive and [1, 0] for negative. The very same object is stored in
	      every row (it is not copied).

	  Returns:
	    A list of ``[features, classification]`` pairs, where ``features`` is a
	    list of ``len(lexicon)`` float counts.
	  """
	  # Map each word to the position of its FIRST occurrence (the value that
	  # ``lexicon.index`` would return) once, so each token costs a single O(1)
	  # lookup instead of two linear scans of the lexicon.
	  word_to_index = {}
	  for position, entry in enumerate(lexicon):
	    word_to_index.setdefault(entry, position)

	  vector_length = len(lexicon)
	  featureset = [] #creating empty list

	  with open(input_file_name,'r') as f:
	    contents = f.readlines()

	  for line in contents[:hm_lines]: #number of lines we need to process
	    tokens = word_tokenize(line.lower()) #lowercase the sentence, then split it into words
	    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
	    features = np.zeros(vector_length) #one counter per lexicon entry
	    for word in lemmas:
	      index_value = word_to_index.get(word.lower())
	      if index_value is not None: #tokens outside the lexicon are ignored
	        features[index_value] += 1

	    # Each row is a two-element list: the feature vector first, its
	    # classification second.
	    featureset.append([list(features),classification])

	  return featureset