honnibal · September 14, 2015 06:35
diff --git a/simple_bigrams.py b/simple_bigrams.py
 from preshed.counter import PreshCounter
 from spacy.en import English

 from spacy.attrs import ORTH, IS_OOV

 import plac

 import plac
 from os import path
 import os
 import bz2
 import ujson
 from preshed.counter import PreshCounter


 def encode_bigram(w1, w2, vocab_size):
    """Pack a pair of word IDs into one integer key.

    The first ID is scaled by ``vocab_size`` and the second ID is added to
    it.  The result identifies the pair unambiguously only while
    ``0 <= w2 < vocab_size``.
    """
    high_part = w1 * vocab_size
    return high_part + w2


 def decode_bigram(bigram, vocab_size):
    """Split a packed bigram key back into its two word IDs.

    Inverse of ``w1 * vocab_size + w2`` for ``0 <= w2 < vocab_size``.

    Returns:
        A ``(w1, w2)`` tuple.
    """
    # The quotient is the first ID and the remainder is the second.  The
    # previous ``bigram - w1`` forgot to scale w1 by vocab_size and so
    # returned a wrong second ID.
    w1, w2 = divmod(bigram, vocab_size)
    return w1, w2


 def count_bigrams(docs, vocab_size):
    """Tally adjacent in-vocabulary token pairs over an iterable of documents.

    Every token that has a successor in its document is paired with that
    successor.  Pairs in which either token reports ``is_oov`` are skipped;
    the rest are packed with ``encode_bigram`` and incremented by one in a
    ``PreshCounter``.  ``smooth()`` is called on the counter once all
    documents have been consumed, and the counter is returned.

    ``docs`` is iterated exactly once, so a generator is acceptable.
    """
    tally = PreshCounter()
    for doc in docs:
        n_tokens = len(doc)
        for token in doc:
            next_index = token.i + 1
            if next_index >= n_tokens:
                # Last token of the document: nothing follows it.
                continue
            following = doc[next_index]
            if token.is_oov or following.is_oov:
                continue
            tally.inc(encode_bigram(token.orth, following.orth, vocab_size), 1)
    tally.smooth()
    return tally


 def estimate_ngram(ngram, bigrams, vocab_size):
    """Score a sequence of word IDs from its adjacent-pair probabilities.

    Args:
        ngram: Sequence of integer word IDs.
        bigrams: Object exposing ``prob(key)`` and ``total``, e.g. the
            counter returned by ``count_bigrams``.
        vocab_size: Multiplier used to pack each pair into a single key.

    Returns:
        The product of ``bigrams.prob`` over every adjacent pair, multiplied
        by ``bigrams.total``.  With fewer than two IDs there are no pairs and
        the result is ``1.0 * bigrams.total``.
    """
    prob = 1.0
    # Walk adjacent pairs directly instead of indexing with range(len(...)).
    for w1, w2 in zip(ngram, ngram[1:]):
        # Same packing as encode_bigram: w1 * vocab_size + w2.
        prob *= bigrams.prob(w1 * vocab_size + w2)
    return prob * bigrams.total


 def gen_docs(tokenizer, data_loc):
    """Lazily yield one tokenized document per line of a bz2 file.

    Each line of the file at ``data_loc`` is parsed with ``ujson.loads`` and
    the value stored under its ``u'body'`` key is passed to ``tokenizer``;
    whatever the tokenizer returns is yielded.  The file stays open until the
    generator is exhausted or closed.
    """
    with bz2.BZ2File(data_loc) as archive:
        for raw_line in archive:
            record = ujson.loads(raw_line)
            yield tokenizer(record[u'body'])


 def main(data_loc):
    """Count bigrams in the corpus at ``data_loc`` and print two estimates.

    Loads the English pipeline with the parser and tagger disabled, streams
    documents from ``data_loc`` through ``gen_docs``, counts their bigrams,
    and prints ``estimate_ngram`` for two fixed example phrases.
    """
    print("Load spacy")
    nlp = English(parser=False, tagger=False)
    tokenize = nlp.tokenizer
    # Size is taken before any text is tokenized and is used as the
    # multiplier for every bigram key below.
    vocab_size = len(nlp.vocab.strings)
    print("setup docs")
    doc_stream = gen_docs(tokenize, data_loc)
    print("Count bigrams")
    bigram_probs = count_bigrams(doc_stream, vocab_size)
    print("Get estimates")
    phrases = (
        u'State of the union',
        u'colorless green ideas sleep furiously',
    )
    # Tokenize both phrases first, then print the estimates in order.
    id_sequences = [[tok.orth for tok in tokenize(text)] for text in phrases]
    for ids in id_sequences:
        print(estimate_ngram(ids, bigram_probs, vocab_size))


 # Script entry point: hand main to plac when this file is run directly.
 # NOTE(review): plac presumably builds the command line (one positional
 # data_loc argument) from main's signature -- confirm against plac docs.
 if __name__ == '__main__':
    plac.call(main)
	from preshed.counter import PreshCounter
	from spacy.en import English

	from spacy.attrs import ORTH, IS_OOV

	import plac

	import plac
	from os import path
	import os
	import bz2
	import ujson
	from preshed.counter import PreshCounter


	def encode_bigram(w1, w2, vocab_size):
	    """Pack two word IDs into one integer key: ``w1 * vocab_size + w2``.

	    The key identifies the pair unambiguously only while
	    ``0 <= w2 < vocab_size``.
	    """
	    # Body indentation restored; the flattened form was a syntax error.
	    return w1 * vocab_size + w2


	def decode_bigram(bigram, vocab_size):
	    """Split a packed bigram key into its ``(w1, w2)`` word IDs.

	    Inverse of ``w1 * vocab_size + w2`` for ``0 <= w2 < vocab_size``.
	    """
	    w1 = bigram // vocab_size
	    # Subtract the scaled first ID; plain ``bigram - w1`` gave a wrong w2.
	    w2 = bigram - w1 * vocab_size
	    return w1, w2


	def count_bigrams(docs, vocab_size):
	    """Count adjacent in-vocabulary token pairs across ``docs``.

	    For each token that has a successor in its document, the pair is
	    skipped if either token reports ``is_oov``; otherwise its packed key
	    (via ``encode_bigram``) is incremented by one in a ``PreshCounter``.
	    ``smooth()`` is called on the counter after the last document, and the
	    counter is returned.
	    """
	    # Nesting restored; the flattened form was a syntax error.
	    counts = PreshCounter()
	    for doc in docs:
	        for w1 in doc:
	            # The final token of a document has no successor to pair with.
	            if (w1.i + 1) < len(doc):
	                w2 = doc[w1.i + 1]
	                if not w1.is_oov and not w2.is_oov:
	                    bigram = encode_bigram(w1.orth, w2.orth, vocab_size)
	                    counts.inc(bigram, 1)
	    counts.smooth()
	    return counts


	def estimate_ngram(ngram, bigrams, vocab_size):
	    """Score a sequence of word IDs from its adjacent-pair probabilities.

	    Multiplies ``bigrams.prob(key)`` over every adjacent pair in ``ngram``
	    (key packed as ``w1 * vocab_size + w2``) and scales the product by
	    ``bigrams.total``.  With fewer than two IDs the result is
	    ``1.0 * bigrams.total``.
	    """
	    prob = 1.0
	    for w1, w2 in zip(ngram, ngram[1:]):
	        # The ``*=`` and ``*`` operators were missing from this line, so
	        # it did not parse; restored so the probabilities accumulate.
	        prob *= bigrams.prob(w1 * vocab_size + w2)
	    return prob * bigrams.total


	def gen_docs(tokenizer, data_loc):
	    """Yield one tokenized document per line of the bz2 file at ``data_loc``.

	    Each line is parsed with ``ujson.loads`` and the value under its
	    ``u'body'`` key is passed to ``tokenizer``; the tokenizer's result is
	    yielded.
	    """
	    # Nesting restored; the flattened form was a syntax error.
	    with bz2.BZ2File(data_loc) as file_:
	        for line in file_:
	            data = ujson.loads(line)
	            doc = tokenizer(data[u'body'])
	            yield doc


	def main(data_loc):
	    """Count bigrams in the corpus at ``data_loc`` and print two estimates.

	    Loads the English pipeline with parser and tagger disabled, streams
	    documents through ``gen_docs``, counts bigrams, and prints
	    ``estimate_ngram`` for two fixed example phrases.
	    """
	    # Body indentation restored; the flattened form was a syntax error.
	    print("Load spacy")
	    nlp = English(parser=False, tagger=False)
	    # Used as the multiplier for every packed bigram key below.
	    vocab_size = len(nlp.vocab.strings)
	    print("setup docs")
	    docs = gen_docs(nlp.tokenizer, data_loc)
	    print("Count bigrams")
	    bigram_probs = count_bigrams(docs, vocab_size)
	    print("Get estimates")
	    likely_ngram = [t.orth for t in nlp.tokenizer(u'State of the union')]
	    unlikely_ngram = [t.orth for t in nlp.tokenizer(u'colorless green ideas sleep furiously')]
	    print(estimate_ngram(likely_ngram, bigram_probs, vocab_size))
	    print(estimate_ngram(unlikely_ngram, bigram_probs, vocab_size))


	# Script entry point: hand main to plac when this file is run directly.
	# The guarded call is indented again; flattened, it did not parse.
	if __name__ == '__main__':
	    plac.call(main)
No results found