Skip to content

Instantly share code, notes, and snippets.

@josephcc
Created April 27, 2014 02:54
Show Gist options
  • Select an option

  • Save josephcc/11336586 to your computer and use it in GitHub Desktop.

Select an option

Save josephcc/11336586 to your computer and use it in GitHub Desktop.
# Python 2 script: grows an entropy-based binary decision tree of
# "questions" over POS-tag histories (language-model style clustering).
from operator import *
from itertools import *
from collections import Counter
from math import log
# NOTE(review): `lib` presumably supplies all_tags, corpus_reader,
# exact_matcher_factory, history_len_matcher_factory, set_matcher_factory,
# filter2, Y, N, train_file, test_file -- confirm against lib.py.
from lib import *
from copy import copy

MAX_NODES = 64 -1          # 63 slots -- a full binary tree, heap-indexed
TREE = [None] * MAX_NODES  # question id chosen at each tree node
PDFS = [None] * MAX_NODES  # empirical pdf cached at each tree node
Q = []                     # candidate question predicates
DESC = []                  # human-readable description, parallel to Q
TYPE = []                  # question-family id (1..4), parallel to Q
Updf = None                # unigram pdf / total; filled in by the driver below
Utotal = None
POS, PUNC = all_tags(train_file)
TAGS = POS + PUNC
TRAIN = list(corpus_reader(train_file))
TEST = list(corpus_reader(test_file))
# skip-bigram questions
for tag in TAGS:
for r in range(-10, 0):
Q.append(exact_matcher_factory(itemgetter(r), tag))
DESC.append('W%d == %s' % (r, tag))
TYPE.append(1)
pos_head = [x[0] for x in POS]
pos_head = list(set(pos_head))
for pos in pos_head:
for r in range(-10, 0):
Q.append(exact_matcher_factory(itemgetter(r), pos, itemgetter(0)))
DESC.append('W%d[0] == %s' % (r, pos))
TYPE.append(1)
pos_head = filter(lambda x: len(x) > 2, POS)
pos_head = [x[:2] for x in pos_head]
pos_head = list(set(pos_head))
for pos in pos_head:
for r in range(-10, 0):
Q.append(exact_matcher_factory(itemgetter(r), pos, lambda x: x[:2]))
DESC.append('W%d[:2] == %s' % (r, pos))
TYPE.append(1)
skip_bigram = len(Q)
print '# of skip-bigram questions:', skip_bigram
# history length questions
for l in range(10):
Q.append(history_len_matcher_factory(l))
DESC.append('len(history) == %d' % l)
TYPE.append(2)
Q.append(history_len_matcher_factory(l, mode=operator.lt))
DESC.append('len(history) < %d' % l)
TYPE.append(2)
history_length = len(Q) - skip_bigram
print '# of history length questions:', history_length
# symbols in history range questions
for l in range(0,20):
Q.append(set_matcher_factory(lambda x: x[-l:], ','))
DESC.append('"," in W%d ~ W-1' % -l)
TYPE.append(3)
Q.append(set_matcher_factory(lambda x: x[-l:], ':'))
DESC.append('":" in W%d ~ W-1' % -l)
TYPE.append(3)
Q.append(set_matcher_factory(lambda x: x[-l:], '('))
DESC.append('"(" in W%d ~ W-1' % -l)
TYPE.append(3)
Q.append(set_matcher_factory(lambda x: x[-l:], ')'))
DESC.append('")" in W%d ~ W-1' % -l)
TYPE.append(3)
Q.append(set_matcher_factory(lambda x: x[-l:], '.'))
DESC.append('"." in W%d ~ W-1' % -l)
TYPE.append(3)
symbols_range = len(Q) - skip_bigram - history_length
print '# of symbols in history range questions:', symbols_range
# trigram pos first letter only questions
pos_head = [x[0] for x in POS]
pos_head = list(set(pos_head))
for pos in product(pos_head, repeat=2):
pos = ' '.join(pos)
Q.append(exact_matcher_factory(lambda x: x[:2], pos, itemgetter(0)))
DESC.append('W-2[0]W-1[0] == %s' % pos)
TYPE.append(4)
trigram_pos0 = len(Q) - skip_bigram - history_length - symbols_range
print '# of trigram simplified pos tag[0] questions:', trigram_pos0
Q = list(enumerate(Q))
def build_pdf(events):
    """Estimate an empirical distribution over the word field of events.

    Each event is indexable and carries the word at position 1.  Returns
    (pdf, total): pdf is a list of (word, relative frequency) pairs and
    total is the number of events as a float.
    """
    n = float(len(events))
    counts = Counter(event[1] for event in events)
    distribution = [(word, c / n) for word, c in counts.items()]
    return distribution, n
def H(pdf, total):
    """Smoothed entropy-style score of `pdf` in natural-log units.

    Every probability is bumped by 1/total (add-one-count smoothing)
    before accumulating p * log(1/p); an empty pdf scores 0.
    """
    acc = 0
    for _, prob in pdf:
        smoothed = prob + (1.0 / total)
        acc += smoothed * log(1.0 / smoothed)
    return acc
def CH(Tpdf, Epdf, Etotal):
    """Cross entropy of distribution `Tpdf` against estimate `Epdf`.

    Epdf probabilities are smoothed by 1/Etotal; a word absent from Epdf
    falls back to the bare 1/Etotal floor.  Natural-log units.
    """
    estimate = dict(Epdf)
    h = 0
    for word, prob in Tpdf:
        # `.get(word, 0.0)` folds the original has_key if/else into one
        # smoothing rule; dict.has_key is Python-2-only, `in`/.get works
        # in both, with identical results.
        q = estimate.get(word, 0.0) + (1.0 / Etotal)
        h += prob * log(1.0 / q)
    return h
def avg_loglikelihood(pdf):
    """Mean natural-log probability over the entries of `pdf`.

    Returns 0.0 for an empty pdf rather than dividing by zero.
    """
    if not pdf:
        return 0.0
    return sum(log(prob) for _, prob in pdf) / len(pdf)
def build_tree(TREE, node, Q, events, pdf, total, Tevents, Tpdf):
    """Recursively grow the decision tree by greedy information gain.

    TREE          -- flat heap-style array; TREE[node] records the chosen
                     question id for this node
    node          -- index of the current node in TREE
    Q             -- remaining candidate questions as (qid, predicate) pairs
    events        -- training events that reached this node
    pdf, total    -- empirical distribution / float count for `events`
    Tevents, Tpdf -- held-out (test) events and their distribution

    Side effects: fills PDFS[node] and TREE[node], writes Node<id>.txt
    with the full question ranking, prints per-node diagnostics, and
    mutates Q (the chosen question is deleted before recursing).
    """
    # Stop when no questions remain or the node index exceeds the tree.
    if not len(Q) or node >= len(TREE):
        return
    PDFS[node] = pdf
    Hw = H(PDFS[node], total)
    # Score every remaining question by branch-weighted information gain.
    Is = []
    for qid, q in Q:
        # filter2 presumably splits events into (matches, non-matches) -- confirm in lib.
        Ey, En = filter2(q, events)
        if len(Ey) == 0 or len(En) == 0:
            # Degenerate split: keep a placeholder so indices stay
            # aligned with Q.
            Is.append(None)
            continue
        PDFy, Cy = build_pdf(Ey)
        PDFn, Cn = build_pdf(En)
        Iy = Hw - H(PDFy, Cy)
        In = Hw - H(PDFn, Cn)
        Py = float(Cy) / (Cy + Cn)
        Pn = float(Cn) / (Cy + Cn)
        score = (Py*Iy) + (Pn*In)
        Is.append(score)
    # Keep only usable, strictly positive gains, paired with their index
    # into Q.  (Python 2 filter returns a list, so len() below is valid.)
    Is = list(enumerate(Is))
    Is = filter(lambda x: x[1] != None, Is)
    Is = filter(lambda x: x[1] > 0, Is)
    if len(Is) == 0:
        return
    Is.sort(key=lambda x: x[1], reverse=True)
    idx, score = Is[0]
    # NOTE(review): unreachable -- the `> 0` filter above already
    # excludes zero scores.
    if score == 0.0:
        return
    # Dump the full ranking for this node (2**score, a perplexity-style
    # figure, plus the question family and description).
    fp = open('Node%d.txt' % node, 'w')
    for idx, score in Is:
        qid, q = Q[idx]
        print >> fp, '%.5f\t%d\t%s' % (2**score, TYPE[qid], DESC[qid])
    fp.close()
    # Re-select the winner: idx/score were clobbered by the dump loop.
    idx, score = Is[0]
    qid, q = Q[idx]
    TREE[node] = qid
    # Each question is used at most once along a root-to-leaf path.
    del Q[idx]
    Ey, En = filter2(q, events)
    PDFy, Cy = build_pdf(Ey)
    PDFn, Cn = build_pdf(En)
    Qy = copy(Q)
    Qn = copy(Q)
    # Evaluate the same split on the held-out data via cross entropy.
    TEy, TEn = filter2(q, Tevents)
    TPDFy, TCy = build_pdf(TEy)
    TPDFn, TCn = build_pdf(TEn)
    THw = CH(Tpdf, pdf, total)
    TIy = THw - CH(TPDFy, PDFy, Cy)
    TIn = THw - CH(TPDFn, PDFn, Cn)
    try:
        TPy = float(TCy) / (TCy + TCn)
        TPn = float(TCn) / (TCy + TCn)
        Tscore = (TPy*TIy) + (TPn*TIn)
    # Bare except: intended to catch ZeroDivisionError when one branch
    # receives no held-out events.
    except:
        Tscore = None
    perplexity = 2 ** score
    ALLy = avg_loglikelihood(PDFy)
    ALLn = avg_loglikelihood(PDFn)
    Py = float(Cy) / (Cy + Cn)
    Pn = float(Cn) / (Cy + Cn)
    ALL = (Py*ALLy) + (Pn*ALLn)
    # Depth of `node` under the heap numbering (root = 0 at level 0).
    level = int(log(node+1, 2))
    if Tscore:
        Tperplexity = 2 ** Tscore
        TALLy = avg_loglikelihood(TPDFy)
        TALLn = avg_loglikelihood(TPDFn)
        TALL = (TPy*TALLy) + (TPn*TALLn)
        # Unigram baseline from the module-level Updf/Utotal globals.
        # NOTE(review): the inner calls pass Cy/Cn as the smoothing
        # totals rather than Utotal -- confirm this is intentional.
        UHw = CH(Tpdf, Updf, Utotal)
        UIy = UHw - CH(TPDFy, Updf, Cy)
        UIn = UHw - CH(TPDFn, Updf, Cn)
        Uscore = (TPy*UIy) + (TPn*UIn)
        Uperplexity = 2 ** Uscore
        print 'Node%.2d level:%d Train:%.4f %.10f Truth:%.4f %.10f Uni:%.4f Q:%s' % (node, level, perplexity, ALL, Tperplexity, TALL, Uperplexity, DESC[qid])
        # Show the five most likely words down each branch.
        PDFy.sort(key=lambda x: x[1], reverse=True)
        PDFn.sort(key=lambda x: x[1], reverse=True)
        print ' Y:', ' '.join(["%s:%.2f%%" % (w.rjust(4), 100*p) for w, p in PDFy[:5]])
        print ' N:', ' '.join(["%s:%.2f%%" % (w.rjust(4), 100*p) for w, p in PDFn[:5]])
    else:
        print 'Node%.2d level:%d Train:%.4f %.10f Truth:no data Uni:NoData Q:%s' % (node, level, perplexity, ALL, DESC[qid])
    # Recurse into the yes/no children; Y/N presumably map a heap index
    # to its child indices -- confirm in lib.
    build_tree(TREE, Y(node), Qy, Ey, PDFy, Cy, TEy, TPDFy)
    build_tree(TREE, N(node), Qn, En, PDFn, Cn, TEn, TPDFn)
pdf, total = build_pdf(TRAIN)
Hw = H(pdf, total)
ALL = avg_loglikelihood(pdf)
print 'Train Entropy:', Hw
print 'Train Perlexity:', 2 ** Hw
print 'Train ALL:', ALL
Tpdf, Ttotal = build_pdf(TEST)
THw = H(Tpdf, Ttotal)
TALL = avg_loglikelihood(Tpdf)
print 'Test Entropy:', THw
print 'Test Perlexity:', 2 ** THw
print 'Test ALL:', TALL
Updf = pdf
Utotal = total
build_tree(TREE, 0, copy(Q), copy(TRAIN), pdf, total, copy(TEST), Tpdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment