Created
July 6, 2017 06:40
-
-
Save ratsgo/45d6eb4822ae27b01329e3b8c15c8f98 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from pandas import read_table | |
import numpy as np | |
import math | |
class NaiveBayesClassifier: | |
def __init__(self, k=0.5): | |
self.k = k | |
self.word_probs = [] | |
def load_corpus(self, path): | |
corpus = read_table(path, sep=',', encoding='utf-8') | |
corpus = np.array(corpus) | |
return corpus | |
def count_words(self, training_set): | |
# 학습데이터는 영화리뷰 본문(doc), 평점(point)으로 구성 | |
counts = defaultdict(lambda : [0, 0]) | |
for doc, point in training_set: | |
# 영화리뷰가 text일 때만 카운트 | |
if self.isNumber(doc) is False: | |
# 리뷰를 띄어쓰기 단위로 토크나이징 | |
words = doc.split() | |
for word in words: | |
counts[word][0 if point > 3.5 else 1] += 1 | |
return counts | |
def isNumber(self, s): | |
try: | |
float(s) | |
return True | |
except ValueError: | |
return False | |
def word_probabilities(self, counts, total_class0, total_class1, k): | |
# 단어의 빈도수를 [단어, p(w|긍정), p(w|부정)] 형태로 반환 | |
return [(w, | |
(class0 + k) / (total_class0 + 2*k), | |
(class1 + k) / (total_class1 + 2*k)) | |
for w, (class0, class1) in counts.items()] | |
def class0_probability(self, word_probs, doc): | |
# 별도 토크나이즈 안하고 띄어쓰기로만 | |
docwords = doc.split() | |
# 초기값은 모두 0으로 처리 | |
log_prob_if_class0 = log_prob_if_class1 = 0.0 | |
# 모든 단어에 대해 반복 | |
for word, prob_if_class0, prob_if_class1 in word_probs: | |
# 만약 리뷰에 word가 나타나면 | |
# 해당 단어가 나올 log 확률을 더해 줌 | |
if word in docwords: | |
log_prob_if_class0 += math.log(prob_if_class0) | |
log_prob_if_class1 += math.log(prob_if_class1) | |
# 만약 리뷰에 word가 나타나지 않는다면 | |
# 해당 단어가 나오지 않을 log 확률을 더해 줌 | |
# 나오지 않을 확률은 log(1-나올 확률)로 계산 | |
else: | |
log_prob_if_class0 += math.log(1.0 - prob_if_class0) | |
log_prob_if_class1 += math.log(1.0 - prob_if_class1) | |
prob_if_class0 = math.exp(log_prob_if_class0) | |
prob_if_class1 = math.exp(log_prob_if_class1) | |
return prob_if_class0 / (prob_if_class0 + prob_if_class1) | |
def train(self, trainfile_path): | |
training_set = self.load_corpus(trainfile_path) | |
# 범주0(긍정)과 범주1(부정) 문서 수를 세어 줌 | |
num_class0 = len([1 for _, point in training_set if point > 3.5]) | |
num_class1 = len(training_set) - num_class0 | |
# train | |
word_counts = self.count_words(training_set) | |
self.word_probs = self.word_probabilities(word_counts, | |
num_class0, | |
num_class1, | |
self.k) | |
def classify(self, doc): | |
return self.class0_probability(self.word_probs, doc) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment