Last active
May 24, 2018 09:13
-
-
Save gauravbansal98/7e22e613b16e5a67a1f9857b3e5d794e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_lexicon(pos='pos.txt', neg='neg.txt', min_count=50, max_count=1000):
    """Build a lexicon of mid-frequency words from two text files.

    Each line of both files is lowercased, split into tokens with
    ``word_tokenize`` and every token is passed through
    ``lemmatizer.lemmatize`` (both names are expected to be defined at
    module level). Occurrences are counted across the two files together,
    and only words whose count lies strictly between ``min_count`` and
    ``max_count`` are kept.

    Args:
        pos: Path of the file holding the positive examples.
        neg: Path of the file holding the negative examples.
        min_count: Exclusive lower bound on a word's number of occurrences.
        max_count: Exclusive upper bound on a word's number of occurrences.

    Returns:
        A list of the retained words, in order of first appearance
        (positive file first, then negative file).

    Side effects:
        Prints the size of the resulting lexicon.
    """
    w_counts = Counter()
    # Both files go through the exact same pipeline, so handle them in one
    # loop instead of duplicating the code per file.
    for path in (pos, neg):
        with open(path, 'r') as f:
            # Iterate the file lazily rather than loading every line up front.
            for line in f:
                tokens = word_tokenize(line.lower())
                w_counts.update(lemmatizer.lemmatize(tok) for tok in tokens)

    # Drop words that are too rare or too common; both bounds are exclusive.
    final_lexicon = [
        word for word, count in w_counts.items()
        if min_count < count < max_count
    ]
    print(len(final_lexicon))
    return final_lexicon
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment