Created
April 27, 2014 02:54
-
-
Save josephcc/11336586 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from operator import * | |
| from itertools import * | |
| from collections import Counter | |
| from math import log | |
| from lib import * | |
| from copy import copy | |
| MAX_NODES = 64 -1 | |
| TREE = [None] * MAX_NODES | |
| PDFS = [None] * MAX_NODES | |
| Q = [] | |
| DESC = [] | |
| TYPE = [] | |
| Updf = None | |
| Utotal = None | |
| POS, PUNC = all_tags(train_file) | |
| TAGS = POS + PUNC | |
| TRAIN = list(corpus_reader(train_file)) | |
| TEST = list(corpus_reader(test_file)) | |
| # skip-bigram questions | |
| for tag in TAGS: | |
| for r in range(-10, 0): | |
| Q.append(exact_matcher_factory(itemgetter(r), tag)) | |
| DESC.append('W%d == %s' % (r, tag)) | |
| TYPE.append(1) | |
| pos_head = [x[0] for x in POS] | |
| pos_head = list(set(pos_head)) | |
| for pos in pos_head: | |
| for r in range(-10, 0): | |
| Q.append(exact_matcher_factory(itemgetter(r), pos, itemgetter(0))) | |
| DESC.append('W%d[0] == %s' % (r, pos)) | |
| TYPE.append(1) | |
| pos_head = filter(lambda x: len(x) > 2, POS) | |
| pos_head = [x[:2] for x in pos_head] | |
| pos_head = list(set(pos_head)) | |
| for pos in pos_head: | |
| for r in range(-10, 0): | |
| Q.append(exact_matcher_factory(itemgetter(r), pos, lambda x: x[:2])) | |
| DESC.append('W%d[:2] == %s' % (r, pos)) | |
| TYPE.append(1) | |
| skip_bigram = len(Q) | |
| print '# of skip-bigram questions:', skip_bigram | |
| # history length questions | |
| for l in range(10): | |
| Q.append(history_len_matcher_factory(l)) | |
| DESC.append('len(history) == %d' % l) | |
| TYPE.append(2) | |
| Q.append(history_len_matcher_factory(l, mode=operator.lt)) | |
| DESC.append('len(history) < %d' % l) | |
| TYPE.append(2) | |
| history_length = len(Q) - skip_bigram | |
| print '# of history length questions:', history_length | |
| # symbols in history range questions | |
| for l in range(0,20): | |
| Q.append(set_matcher_factory(lambda x: x[-l:], ',')) | |
| DESC.append('"," in W%d ~ W-1' % -l) | |
| TYPE.append(3) | |
| Q.append(set_matcher_factory(lambda x: x[-l:], ':')) | |
| DESC.append('":" in W%d ~ W-1' % -l) | |
| TYPE.append(3) | |
| Q.append(set_matcher_factory(lambda x: x[-l:], '(')) | |
| DESC.append('"(" in W%d ~ W-1' % -l) | |
| TYPE.append(3) | |
| Q.append(set_matcher_factory(lambda x: x[-l:], ')')) | |
| DESC.append('")" in W%d ~ W-1' % -l) | |
| TYPE.append(3) | |
| Q.append(set_matcher_factory(lambda x: x[-l:], '.')) | |
| DESC.append('"." in W%d ~ W-1' % -l) | |
| TYPE.append(3) | |
| symbols_range = len(Q) - skip_bigram - history_length | |
| print '# of symbols in history range questions:', symbols_range | |
| # trigram pos first letter only questions | |
| pos_head = [x[0] for x in POS] | |
| pos_head = list(set(pos_head)) | |
| for pos in product(pos_head, repeat=2): | |
| pos = ' '.join(pos) | |
| Q.append(exact_matcher_factory(lambda x: x[:2], pos, itemgetter(0))) | |
| DESC.append('W-2[0]W-1[0] == %s' % pos) | |
| TYPE.append(4) | |
| trigram_pos0 = len(Q) - skip_bigram - history_length - symbols_range | |
| print '# of trigram simplified pos tag[0] questions:', trigram_pos0 | |
| Q = list(enumerate(Q)) | |
| def build_pdf(events): | |
| total = float(len(events)) | |
| words = map(itemgetter(1), events) | |
| words = Counter(words) | |
| pdf = [(word, count/total)for word, count in words.items()] | |
| return pdf, total | |
| def H(pdf, total): | |
| h = 0 | |
| for word, prob in pdf: | |
| prob = prob + (1.0/total) | |
| h += prob * log( 1.0 / prob ) | |
| return h | |
| def CH(Tpdf, Epdf, Etotal): | |
| Epdf = dict(Epdf) | |
| h = 0 | |
| for word, prob in Tpdf: | |
| if Epdf.has_key(word): | |
| h += prob * log( 1.0 / (Epdf[word] + (1.0/Etotal)) ) | |
| else: | |
| h += prob * log( 1.0 / (1.0/Etotal) ) | |
| return h | |
| def avg_loglikelihood(pdf): | |
| if len(pdf) == 0: | |
| return 0.0 | |
| out = 0.0 | |
| for word, prob in pdf: | |
| out += log(prob) | |
| return out / len(pdf) | |
| def build_tree(TREE, node, Q, events, pdf, total, Tevents, Tpdf): | |
| if not len(Q) or node >= len(TREE): | |
| return | |
| PDFS[node] = pdf | |
| Hw = H(PDFS[node], total) | |
| Is = [] | |
| for qid, q in Q: | |
| Ey, En = filter2(q, events) | |
| if len(Ey) == 0 or len(En) == 0: | |
| Is.append(None) | |
| continue | |
| PDFy, Cy = build_pdf(Ey) | |
| PDFn, Cn = build_pdf(En) | |
| Iy = Hw - H(PDFy, Cy) | |
| In = Hw - H(PDFn, Cn) | |
| Py = float(Cy) / (Cy + Cn) | |
| Pn = float(Cn) / (Cy + Cn) | |
| score = (Py*Iy) + (Pn*In) | |
| Is.append(score) | |
| Is = list(enumerate(Is)) | |
| Is = filter(lambda x: x[1] != None, Is) | |
| Is = filter(lambda x: x[1] > 0, Is) | |
| if len(Is) == 0: | |
| return | |
| Is.sort(key=lambda x: x[1], reverse=True) | |
| idx, score = Is[0] | |
| if score == 0.0: | |
| return | |
| fp = open('Node%d.txt' % node, 'w') | |
| for idx, score in Is: | |
| qid, q = Q[idx] | |
| print >> fp, '%.5f\t%d\t%s' % (2**score, TYPE[qid], DESC[qid]) | |
| fp.close() | |
| idx, score = Is[0] | |
| qid, q = Q[idx] | |
| TREE[node] = qid | |
| del Q[idx] | |
| Ey, En = filter2(q, events) | |
| PDFy, Cy = build_pdf(Ey) | |
| PDFn, Cn = build_pdf(En) | |
| Qy = copy(Q) | |
| Qn = copy(Q) | |
| TEy, TEn = filter2(q, Tevents) | |
| TPDFy, TCy = build_pdf(TEy) | |
| TPDFn, TCn = build_pdf(TEn) | |
| THw = CH(Tpdf, pdf, total) | |
| TIy = THw - CH(TPDFy, PDFy, Cy) | |
| TIn = THw - CH(TPDFn, PDFn, Cn) | |
| try: | |
| TPy = float(TCy) / (TCy + TCn) | |
| TPn = float(TCn) / (TCy + TCn) | |
| Tscore = (TPy*TIy) + (TPn*TIn) | |
| except: | |
| Tscore = None | |
| perplexity = 2 ** score | |
| ALLy = avg_loglikelihood(PDFy) | |
| ALLn = avg_loglikelihood(PDFn) | |
| Py = float(Cy) / (Cy + Cn) | |
| Pn = float(Cn) / (Cy + Cn) | |
| ALL = (Py*ALLy) + (Pn*ALLn) | |
| level = int(log(node+1, 2)) | |
| if Tscore: | |
| Tperplexity = 2 ** Tscore | |
| TALLy = avg_loglikelihood(TPDFy) | |
| TALLn = avg_loglikelihood(TPDFn) | |
| TALL = (TPy*TALLy) + (TPn*TALLn) | |
| UHw = CH(Tpdf, Updf, Utotal) | |
| UIy = UHw - CH(TPDFy, Updf, Cy) | |
| UIn = UHw - CH(TPDFn, Updf, Cn) | |
| Uscore = (TPy*UIy) + (TPn*UIn) | |
| Uperplexity = 2 ** Uscore | |
| print 'Node%.2d level:%d Train:%.4f %.10f Truth:%.4f %.10f Uni:%.4f Q:%s' % (node, level, perplexity, ALL, Tperplexity, TALL, Uperplexity, DESC[qid]) | |
| PDFy.sort(key=lambda x: x[1], reverse=True) | |
| PDFn.sort(key=lambda x: x[1], reverse=True) | |
| print ' Y:', ' '.join(["%s:%.2f%%" % (w.rjust(4), 100*p) for w, p in PDFy[:5]]) | |
| print ' N:', ' '.join(["%s:%.2f%%" % (w.rjust(4), 100*p) for w, p in PDFn[:5]]) | |
| else: | |
| print 'Node%.2d level:%d Train:%.4f %.10f Truth:no data Uni:NoData Q:%s' % (node, level, perplexity, ALL, DESC[qid]) | |
| build_tree(TREE, Y(node), Qy, Ey, PDFy, Cy, TEy, TPDFy) | |
| build_tree(TREE, N(node), Qn, En, PDFn, Cn, TEn, TPDFn) | |
| pdf, total = build_pdf(TRAIN) | |
| Hw = H(pdf, total) | |
| ALL = avg_loglikelihood(pdf) | |
| print 'Train Entropy:', Hw | |
| print 'Train Perlexity:', 2 ** Hw | |
| print 'Train ALL:', ALL | |
| Tpdf, Ttotal = build_pdf(TEST) | |
| THw = H(Tpdf, Ttotal) | |
| TALL = avg_loglikelihood(Tpdf) | |
| print 'Test Entropy:', THw | |
| print 'Test Perlexity:', 2 ** THw | |
| print 'Test ALL:', TALL | |
| Updf = pdf | |
| Utotal = total | |
| build_tree(TREE, 0, copy(Q), copy(TRAIN), pdf, total, copy(TEST), Tpdf) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment