Created
October 27, 2019 17:06
-
-
Save aipi/de03e97e03710f43c660f7e08a5e4de6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import math
import random
from collections import Counter
class Dataset:
    """Build a word-level training set from a CSV file of labelled reviews.

    Every sample is a ``(word, label)`` tuple: the word comes from the third
    column of a row (lower-cased, commas and periods removed) and the label
    is the row's last column.
    """

    def remove_stop_words(self, text):
        """Return *text* with every comma and period removed.

        Despite the name, no stop-word filtering happens here; only these
        two punctuation characters are stripped.
        """
        return text.replace(',', '').replace('.', '')

    def get_portuguese_text(self, row, dataset):
        """Append one ``(word, label)`` tuple per word of *row* to *dataset*.

        Args:
            row: A parsed CSV row. Index 2 is read as the review text and
                the last element as the label.
            dataset: List that receives the samples; it is mutated in place.

        Returns:
            The same *dataset* list that was passed in.
        """
        label = row[-1]
        words = self.remove_stop_words(row[2].lower()).split()
        dataset.extend((word, label) for word in words)
        return dataset

    def execute(
        self,
        path='imdb-reviews-pt-br.csv',
        sample_size=100,
        has_header=True,
        encoding='utf-8',
    ):
        """Load the CSV, sample rows at random and return word-level samples.

        Args:
            path: CSV file to read.
            sample_size: Maximum number of rows to draw at random.
            has_header: When true, the first row is discarded before
                sampling so it can never become a training sample.
            encoding: Text encoding used to open the file.
                NOTE(review): UTF-8 is assumed for the default file; confirm.

        Returns:
            A list of ``(word, label)`` tuples built from the sampled rows.
        """
        with open(path, newline='', encoding=encoding) as csvfile:
            reader = csv.reader(
                csvfile,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )
            if has_header:
                # Drop the header up front: once rows are shuffled there is
                # no reliable way to find and remove it afterwards.
                next(reader, None)
            # Rows with fewer than three columns (e.g. blank lines) carry no
            # review text at index 2, so they are skipped.
            rows = [row for row in reader if len(row) >= 3]

        wanted = min(max(sample_size, 0), len(rows))
        dataset = []
        for row in random.sample(rows, wanted):
            self.get_portuguese_text(row, dataset)
        return dataset
class NaiveBaivys:
    """Naive Bayes sentiment classifier over ``(word, sentiment)`` samples.

    Args:
        dataset: Sequence of samples; index 0 of each sample is the word and
            index 1 is its sentiment label.
        text: Text to classify. It is split on whitespace only, with no
            lower-casing or punctuation stripping, so callers should
            normalise it the same way the dataset words were normalised.
    """

    def __init__(self, dataset: list, text: str):
        self.dataset = dataset
        self.text = text.split()

    @staticmethod
    def _smoothed_likelihood(word_count, sentiment_total, vocabulary_size):
        """Return the add-one (Laplace) smoothed likelihood of a word."""
        return (word_count + 1) / (sentiment_total + vocabulary_size)

    def calculate_total_word_in_sentiment(self, word, sentiment: str):
        """Count the samples whose word and sentiment both match."""
        return sum(
            1 for data in self.dataset
            if data[0] == word and data[1] == sentiment
        )

    def calculate_total_terms_in_sentiment(self, sentiment: str):
        """Count the samples labelled with *sentiment*."""
        return sum(1 for data in self.dataset if data[1] == sentiment)

    def calculate_total_terms_in_text(self):
        """Return the number of distinct words in the dataset (vocabulary)."""
        return len({data[0] for data in self.dataset})

    def laplace_smoothing(self, word: str, sentiment: str):
        """Return P(word | sentiment) with add-one smoothing.

        Raises:
            ZeroDivisionError: If the dataset is empty.
        """
        return self._smoothed_likelihood(
            self.calculate_total_word_in_sentiment(word, sentiment),
            self.calculate_total_terms_in_sentiment(sentiment),
            self.calculate_total_terms_in_text(),
        )

    def sentiment_probability(self, sentiment: str):
        """Return the fraction of samples labelled with *sentiment*.

        Raises:
            ZeroDivisionError: If the dataset is empty.
        """
        return (
            self.calculate_total_terms_in_sentiment(sentiment)
            / len(self.dataset)
        )

    def execute(self):
        """Classify the text and return ``'neg'`` or ``'pos'``.

        Scores are summed log-probabilities rather than a running product:
        multiplying one small likelihood per word underflows to 0.0 on long
        texts, which would tie both classes and always yield ``'pos'``.
        Ties still resolve to ``'pos'``.

        Raises:
            ValueError: If the dataset is empty.
        """
        total_samples = len(self.dataset)
        if not total_samples:
            raise ValueError('cannot classify with an empty dataset')

        # Tally everything in one pass each instead of rescanning the whole
        # dataset for every word of the text.
        pair_counts = Counter((data[0], data[1]) for data in self.dataset)
        sentiment_totals = Counter(data[1] for data in self.dataset)
        vocabulary_size = len({data[0] for data in self.dataset})

        scores = {}
        for sentiment in ('neg', 'pos'):
            sentiment_total = sentiment_totals[sentiment]
            if not sentiment_total:
                # A class with no samples has prior 0; log(0) is undefined,
                # so it gets the lowest possible score.
                scores[sentiment] = float('-inf')
                continue
            score = math.log(sentiment_total / total_samples)
            for word in self.text:
                score += math.log(
                    self._smoothed_likelihood(
                        pair_counts[(word, sentiment)],
                        sentiment_total,
                        vocabulary_size,
                    )
                )
            scores[sentiment] = score

        return 'neg' if scores['neg'] > scores['pos'] else 'pos'
if __name__ == '__main__':
    # Build the training samples, then classify an empty text. With no words
    # to score, the printed label depends only on the class priors.
    training_samples = Dataset().execute()
    classifier = NaiveBaivys(dataset=training_samples, text='')
    print(classifier.execute())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment