Last active
August 29, 2015 13:57
-
-
Save ikegami-yukino/9920280 to your computer and use it in GitHub Desktop.
Standard Naive Bayes and Complement Naive Bayes using madoka
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
import numpy as np | |
from collections import Counter, defaultdict | |
import madoka | |
# Sentinel keys stored inside the madoka sketches; the double brackets keep
# them from colliding with real document tokens.
NUM_DOCS_INDEX = '[[NUM_DOCS]]'  # counts how many documents were added
ALL_WORD_INDEX = '[[ALL]]'  # accumulates the total word weight per category
class TFIDF(object):
    """TF / TF-IDF vectorizer backed by a madoka count-min sketch.

    Document frequencies are kept in a madoka.Sketch, so the DF table stays
    compact for large vocabularies (counts are approximate by design).
    """

    def __init__(self, filename=None):
        """
        Params:
            <str> filename : optional path to a previously saved DF sketch
        """
        self.df = madoka.Sketch()
        if filename:
            self.df.load(filename)

    def save(self, filename):
        """Persist the DF sketch to *filename*."""
        self.df.save(filename)

    def add(self, doc):
        """Update the DF table with one document (a sequence of words)."""
        self.df.inc(NUM_DOCS_INDEX, len(NUM_DOCS_INDEX))
        # Each distinct word counts once per document.
        for word in set(doc):
            self.df.inc(word, len(word))

    def get_df_as_log(self, word):
        """Return log(DF(word)).

        NOTE(review): a word never added has DF == 0, so this returns
        -inf — confirm callers tolerate that.
        """
        return np.log(self.df.get(word, len(word)))

    def tfidf(self, doc, mode='tfidf', update_table=True):
        """Return {word: weight} for *doc*.

        Params:
            <sequence> doc : tokenized document
            <str> mode : 'tf' (term frequency only) or 'tfidf'
            <bool> update_table : when True, *doc* is first added to the
                                  DF table
        Raises:
            ValueError : on an unknown mode (previously this surfaced as a
                         NameError because num_docs was never assigned)
        """
        mode = mode.lower()
        if mode not in ('tf', 'tfidf'):
            raise ValueError("mode must be 'tf' or 'tfidf': %r" % (mode,))
        if update_table:
            self.add(doc)
        num_words = float(len(doc))
        # One counting pass instead of doc.count(word) per distinct word
        # (that was O(n^2) in document length).
        counts = Counter(doc)
        result = {}
        if mode == 'tf':
            for word, freq in counts.items():
                result[word] = freq / num_words
        else:
            num_docs = self.get_df_as_log(NUM_DOCS_INDEX)
            for word, freq in counts.items():
                tf = freq / num_words
                idf = num_docs - self.get_df_as_log(word) + 1
                result[word] = tf * idf
        return result
class MadokaNaiveBayes(object):
    """Multinomial Naive Bayes whose word counts live in a madoka sketch.

    Keys in ``word_counter`` are '<category>/<word>' strings; the special
    word ALL_WORD_INDEX accumulates the total weight per category.
    """

    def __init__(self, alpha=1.0):
        """
        Params:
            <float> alpha : additive (Lidstone/Laplace) smoothing
                            hyperparameter
        """
        self.alpha = alpha
        self.voca = 0.0  # number of distinct words seen so far (|V|)
        self.word_counter = madoka.CroquisDouble()
        self.cat_counter = Counter()

    def _count_word(self, word, val, cat):
        """Add *val* to the (cat, word) counter, tracking vocabulary size."""
        seen = any(self.word_counter.get('%s/%s' % (c, word))
                   for c in self.cat_counter.keys())
        # BUGFIX: |V| must grow when a word is NEW.  The original used
        # ``if already`` and therefore incremented voca on every repeat
        # occurrence while never counting first occurrences.
        if not seen:
            self.voca += 1
        self.word_counter.add('%s/%s' % (cat, word), val)
        self.word_counter.add('%s/%s' % (cat, ALL_WORD_INDEX), val)

    def train(self, doc, cat):
        """Train on one document given as {word: weight} (e.g. a tfidf dict)."""
        for word, val in doc.items():
            self._count_word(word, val, cat)
        self.cat_counter[cat] += 1

    def _pr_category(self, cat):
        """Prior probability of a category: Pr(c)."""
        # builtin sum(): np.sum() over a dict view misbehaves on Python 3.
        return float(self.cat_counter[cat]) / sum(self.cat_counter.values())

    def _freq_word_incategory(self, word, cat):
        """Accumulated weight of *word* within *cat*."""
        return self.word_counter.get('%s/%s' % (cat, word))

    def _calc_theta(self, word, cat):
        """θ = Pr(w|c) with additive smoothing."""
        total = self.word_counter.get('%s/%s' % (cat, ALL_WORD_INDEX))
        # Lidstone smoothing: alpha in the numerator pairs with alpha*|V|
        # in the denominator (the original hard-coded alpha=1 there, which
        # only happens to match the default hyperparameter).
        return (self._freq_word_incategory(word, cat) + self.alpha) / \
               (total + self.alpha * self.voca)

    def _calc_prob(self, doc, cat):
        """log(Pr(c)) + sum(weight * log(Pr(w|c)))"""
        logprob = np.log(self._pr_category(cat))
        for word, val in doc.items():
            logprob += val * np.log(self._calc_theta(word, cat))
        return logprob

    def classify(self, doc):
        """Score every known category for {word: weight} *doc*."""
        result = Counter()
        for cat in self.cat_counter.keys():
            result[cat] = self._calc_prob(doc, cat)
        return result

    def log_to_prob(self, data):
        """Normalize the scores in *data* in place so they sum to 1.

        NOTE(review): this divides raw log-scores by their sum; it yields a
        relative ranking, NOT exp()-normalized posterior probabilities —
        confirm that is the intent before relying on the values.
        """
        total = sum(data.values())
        for k, v in data.items():
            data[k] = v / total
        return data
class MadokaCNaiveBayes(MadokaNaiveBayes):
    """Complement Naive Bayes: scores a category by the evidence for its
    complement (every OTHER category) and subtracts it from the prior."""

    def _calc_prob(self, doc, cat, theta):
        """log(Pr(c)) - sum(weight * log(sum of Pr(w|c') for c' != cat))"""
        complement = 0
        for word, weight in doc.items():
            others = sum(p for other, p in theta[word].items() if other != cat)
            complement += weight * np.log(others)
        return np.log(self._pr_category(cat)) - complement

    def classify(self, doc):
        """Score every known category for {word: weight} *doc*."""
        cats = list(self.cat_counter.keys())
        # Precompute theta[word][category] once; each category's score
        # re-reads every other category's theta.
        theta = defaultdict(dict)
        for word in doc:
            for cat in cats:
                theta[word][cat] = self._calc_theta(word, cat)
        return Counter({cat: self._calc_prob(doc, cat, theta) for cat in cats})
mb = MadokaCNaiveBayes() | |
t = TFIDF() | |
texts = ( | |
('CS', ('Python', 'プログラミング言語')), | |
('CS', ('Ruby', 'プログラミング言語')), | |
('CS', ('Python', '自然言語処理')), | |
('BB', ('イチロー', 'ホームラン')), | |
('BB', ('打者', '盗塁', 'ホームラン')), | |
('FOOD', ('ラーメン', 'カレー', 'スパゲッティ')), | |
('FOOD', ('ラーメン', '二郎', 'もやし')), | |
) | |
for text in texts: | |
doc = text[1] | |
doc = t.add(doc) | |
for text in texts: | |
cat = text[0] | |
doc = text[1] | |
doc = t.tfidf(doc) | |
mb.train(doc, cat) | |
print mb.classify(t.tfidf(('Python', 'Ruby'))) | |
print mb.classify(t.tfidf(('イチロー', '打者'))) | |
print mb.classify(t.tfidf(('ラーメン', 'もやし'))) | |
r = mb.classify(t.tfidf(('Python', 'Ruby'))) | |
for k,v in mb.log_to_prob(r).items(): | |
print k, v | |
r = mb.classify(t.tfidf(('イチロー', '打者'))) | |
for k,v in mb.log_to_prob(r).items(): | |
print k, v | |
r = mb.classify(t.tfidf(('ラーメン', 'もやし', '二郎'))) | |
for k,v in mb.log_to_prob(r).items(): | |
print k, v | |
r = mb.classify(t.tfidf(('ラーメン', 'Perl'))) | |
for k,v in mb.log_to_prob(r).items(): | |
print k, v |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment