My very old implementation of "Gibbs Sampling for the Uninitiated" (Philip Resnik, Eric Hardisty)
# -*- coding: utf-8 -*-
""" Naive Bayes with Gibbs sampling, so it can deal with unlabeled data """
# as from "Gibbs Sampling for the Uninitiated", Philip Resnik, Eric Hardisty
import os, re, random, math
from collections import Counter
def Dirichlet(v):
    """ takes a vector of counts v and returns a Multinomial ~ Dirichlet(v) """
    y = []
    for count in v:
        # the floor keeps each draw strictly positive: gammavariate underflows
        # to 0.0 for near-zero shapes, which would break math.log downstream
        y.append(max(random.gammavariate(count + 0.000001, 1), 1e-300))
    s = sum(y)
    return [e / s for e in y]
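# Note: drawing y_i ~ Gamma(v_i, 1) independently and normalizing by the sum
# is the standard way to sample from Dirichlet(v); the 0.000001 keeps the
# shape strictly positive for zero-count words, acting as a near-zero
# stand-in for the paper's γ_θ pseudo-counts.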
points = re.compile('[,.?!:;]')  # simplest possible tokenizer: strip punctuation, split on whitespace
docs = {}
prior_good = 0.5
n_bad = 0
n_good = 0
for r, d, fl in os.walk('reverend_thomas/test_rss/bad/'):
    for fname in fl:
        path = os.path.join(r, fname)
        print path
        txt = filter(lambda x: x != '',
                     points.sub('', open(path).read()).split())
        docs[fname] = {'text': txt, 'label': 'bad'}
        n_bad += 1
for r, d, fl in os.walk('reverend_thomas/test_rss/good/'):
    for fname in fl:
        path = os.path.join(r, fname)
        print path
        txt = filter(lambda x: x != '',
                     points.sub('', open(path).read()).split())
        docs[fname] = {'text': txt, 'label': 'good'}
        n_good += 1
prior_good = 1.0 * n_good / (n_good + n_bad)
words_c = {'bad': {}, 'good': {}}
label_doc = {}
# *************** init ***************
# randomly assign an initial label to every document ('good' with
# probability prior_good) and accumulate per-label word counts
for docname, doc in docs.iteritems():
    label = 'bad'
    if random.random() < prior_good:
        label = 'good'
    label_doc[docname] = label
    for w in doc['text']:
        words_c[label][w] = words_c[label].get(w, 0) + 1
vocab = set(words_c['bad'].keys() + words_c['good'].keys())
t_bad = [words_c['bad'].get(w, 0) for w in vocab]
t_good = [words_c['good'].get(w, 0) for w in vocab]
theta_bad = Dirichlet(t_bad)
theta_good = Dirichlet(t_good)
theta = {'bad': dict(zip(vocab, theta_bad)),
         'good': dict(zip(vocab, theta_good))}
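# At this point every document has a tentative label, words_c holds the
# per-label word counts, and theta holds one sampled word distribution per
# label.  Each Gibbs sweep below resamples every document's label given
# theta and the other documents' labels, then resamples theta.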
# *************** Gibbs sampling ***************
for run_number in range(420):
    for docname, doc in docs.iteritems():
        lab = label_doc[docname]
        c_w = Counter(doc['text'])
        # remove this document's counts from its current label
        for w, count in c_w.iteritems():
            words_c[lab][w] -= count
        label_doc[docname] = 'unlabeled'
        # per-label log-likelihood of the document:
        # log \prod_{i=1}^V θ_{x,i}^{W_{j,i}} = Σ_i W_{j,i} log θ_{x,i}
        running_prod = {'bad': 0.0, 'good': 0.0}
        for x in ('bad', 'good'):
            for w, count in c_w.iteritems():
                running_prod[x] += count * math.log(theta[x][w])
        # shift by the max before exponentiating so long documents don't
        # underflow to 0.0 for both labels; the shift cancels on normalization
        m = max(running_prod.values())
        running_prod = {'bad': math.exp(running_prod['bad'] - m),
                        'good': math.exp(running_prod['good'] - m)}
        # P(L_j=x|L^(-j),θ_0,θ_1,μ) = (N_x+γ_{π,x}-1)/(N+γ_{π,1}+γ_{π,0}-1)
        #                             * \prod_{i=1}^V θ_{x,i}^{W_{j,i}}
        # P(L_j=x|L^(-j),thetas) ≈ N_x/N * running_prod[x]
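        # worked example of the update below (made-up numbers): with
        # c_b_g = {'bad': 3, 'good': 5, 'unlabeled': 1} and
        # running_prod = {'bad': 0.2, 'good': 0.6}, we get
        # val_bad = 3/9 * 0.2 ≈ 0.067 and val_good = 5/9 * 0.6 ≈ 0.333,
        # so the document is relabeled 'good' with probability 0.333/0.4 ≈ 0.83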
        c_b_g = Counter(label_doc.values())
        total = len(label_doc.values())
        val_bad = 1.0 * c_b_g['bad'] / total * running_prod['bad']
        val_good = 1.0 * c_b_g['good'] / total * running_prod['good']
        new_lab = 'bad'
        if random.random() < val_good / (val_bad + val_good):
            new_lab = 'good'
        label_doc[docname] = new_lab
        # add the document's counts back under its (possibly new) label
        for w, count in c_w.iteritems():
            words_c[new_lab][w] = words_c[new_lab].get(w, 0) + count
    # after each full sweep, resample θ_bad and θ_good from their
    # posterior Dirichlets given the updated per-label word counts
    t_bad = [words_c['bad'].get(w, 0) for w in vocab]
    t_good = [words_c['good'].get(w, 0) for w in vocab]
    theta_bad = Dirichlet(t_bad)
    theta_good = Dirichlet(t_good)
    theta = {'bad': dict(zip(vocab, theta_bad)),
             'good': dict(zip(vocab, theta_good))}
print label_doc
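Since the directory names already give gold labels, a quick sanity check is to compare the sampled labels against them once the sweeps finish. This is not part of the original gist, just a minimal sketch that assumes it is appended to the script above, with docs and label_doc still in scope:

# hypothetical evaluation: fraction of documents whose sampled label
# matches the label implied by the directory they were read from
n_agree = sum(1 for docname, doc in docs.iteritems()
              if label_doc[docname] == doc['label'])
print 'agreement: %d/%d = %.3f' % (n_agree, len(docs),
                                   1.0 * n_agree / len(docs))

Because the labels are initialized at random, the sampler can just as well converge to the swapped labeling, so agreement near 0 is as informative as agreement near 1.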