gauravbansal98 · May 24, 2018 09:13
diff --git a/Create lexicon b/Create lexicon
 def create_lexicon(pos_path='pos.txt', neg_path='neg.txt',
                    min_count=50, max_count=1000):
     """Build a vocabulary of moderately frequent words from two text files.

     Every line of ``pos_path`` and then ``neg_path`` is lower-cased, split
     into tokens with ``word_tokenize`` and each token is passed through
     ``lemmatizer.lemmatize``. The lemmas are counted, and only those whose
     count lies strictly between ``min_count`` and ``max_count`` are kept,
     which drops both rare words and very common ones.

     NOTE(review): ``word_tokenize`` and ``lemmatizer`` must be available at
     module level; they are not defined in this block (presumably NLTK's
     ``word_tokenize`` and a ``WordNetLemmatizer`` instance -- confirm). If
     that is the case, ``lemmatize`` called without a ``pos`` argument treats
     every token as a noun, so verb forms such as "going" are NOT reduced
     to "go".

     Args:
         pos_path: Path of the file with positive examples, one per line.
         neg_path: Path of the file with negative examples, one per line.
         min_count: Exclusive lower bound on a lemma's number of occurrences.
         max_count: Exclusive upper bound on a lemma's number of occurrences.

     Returns:
         List of the retained lemmas, in order of first appearance
         (positive file first, then negative file).

     Side effects:
         Prints the size of the returned lexicon to stdout.

     Raises:
         OSError: If either file cannot be opened (e.g. FileNotFoundError).
     """
     w_counts = Counter()
     # Both corpora go through exactly the same pipeline, so loop over the
     # paths instead of duplicating the read/tokenize code.
     for path in (pos_path, neg_path):
         with open(path, 'r') as f:
             # Iterate the file lazily and count lemmas as they are produced;
             # this avoids holding every token of both corpora in memory.
             for line in f:
                 tokens = word_tokenize(line.lower())
                 w_counts.update(lemmatizer.lemmatize(token) for token in tokens)

     # Counter preserves insertion order, so the result is ordered by first
     # occurrence. Both bounds are exclusive.
     final_lexicon = [
         word for word, count in w_counts.items()
         if max_count > count > min_count
     ]
     print(len(final_lexicon))
     return final_lexicon
	def create_lexicon(pos_path='pos.txt', neg_path='neg.txt',
			min_count=50, max_count=1000):
		"""Build a vocabulary of moderately frequent words from two text files.

		Every line of ``pos_path`` and then ``neg_path`` is lower-cased, split
		into tokens with ``word_tokenize`` and each token is passed through
		``lemmatizer.lemmatize``. The lemmas are counted, and only those whose
		count lies strictly between ``min_count`` and ``max_count`` are kept,
		which drops both rare words and very common ones.

		NOTE(review): ``word_tokenize`` and ``lemmatizer`` must be available at
		module level; they are not defined in this block (presumably NLTK's
		``word_tokenize`` and a ``WordNetLemmatizer`` instance -- confirm). If
		that is the case, ``lemmatize`` called without a ``pos`` argument
		treats every token as a noun, so verb forms such as "going" are NOT
		reduced to "go".

		Args:
			pos_path: Path of the file with positive examples, one per line.
			neg_path: Path of the file with negative examples, one per line.
			min_count: Exclusive lower bound on a lemma's occurrence count.
			max_count: Exclusive upper bound on a lemma's occurrence count.

		Returns:
			List of the retained lemmas, in order of first appearance
			(positive file first, then negative file).

		Side effects:
			Prints the size of the returned lexicon to stdout.

		Raises:
			OSError: If either file cannot be opened (e.g. FileNotFoundError).
		"""
		w_counts = Counter()
		# Both corpora go through exactly the same pipeline, so loop over the
		# paths instead of duplicating the read/tokenize code.
		for path in (pos_path, neg_path):
			with open(path, 'r') as f:
				# Iterate the file lazily and count lemmas as they are produced;
				# this avoids holding every token of both corpora in memory.
				for line in f:
					tokens = word_tokenize(line.lower())
					w_counts.update(lemmatizer.lemmatize(token) for token in tokens)

		# Counter preserves insertion order, so the result is ordered by first
		# occurrence. Both bounds are exclusive.
		final_lexicon = [
			word for word, count in w_counts.items()
			if max_count > count > min_count
		]
		print(len(final_lexicon))
		return final_lexicon
No results found