Created
September 14, 2015 06:35
-
-
Save honnibal/7a8edc5e5fbab32985ba to your computer and use it in GitHub Desktop.
Simple but not so accurate bigram language model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from preshed.counter import PreshCounter | |
from spacy.en import English | |
from spacy.attrs import ORTH, IS_OOV | |
import plac | |
import plac | |
from os import path | |
import os | |
import bz2 | |
import ujson | |
from preshed.counter import PreshCounter | |
def encode_bigram(w1, w2, vocab_size):
    """Pack a pair of integer word IDs into a single integer key.

    The key is ``w1 * vocab_size + w2``. The mapping is only reversible
    when ``0 <= w2 < vocab_size``; callers are responsible for ensuring
    that (no range check is performed here).

    Args:
        w1: Integer ID of the first word of the bigram.
        w2: Integer ID of the second word of the bigram.
        vocab_size: Multiplier used as the base of the encoding.

    Returns:
        The integer key for the ``(w1, w2)`` pair.
    """
    return w1 * vocab_size + w2
def decode_bigram(bigram, vocab_size):
    """Split an integer bigram key back into its two word IDs.

    This inverts the ``w1 * vocab_size + w2`` encoding: the quotient is
    the first word ID and the remainder is the second.

    Args:
        bigram: Integer key of the form ``w1 * vocab_size + w2``.
        vocab_size: The base that was used when the key was built.

    Returns:
        A ``(w1, w2)`` tuple of integer word IDs.
    """
    # The remainder (not ``bigram - w1``) is what recovers the second ID.
    w1, w2 = divmod(bigram, vocab_size)
    return w1, w2
def count_bigrams(docs, vocab_size):
    """Count adjacent in-vocabulary token pairs over a stream of documents.

    Each token is paired with the token that follows it in the same
    document. A pair is skipped if either token has ``is_oov`` set.
    After all documents are consumed, ``counts.smooth()`` is called once.

    Args:
        docs: Iterable of tokenized documents. Each document must support
            ``len()``, iteration and integer indexing, and its tokens must
            expose ``i``, ``orth`` and ``is_oov``.
        vocab_size: Base passed to ``encode_bigram`` when building keys.

    Returns:
        The populated (and smoothed) ``PreshCounter``.
    """
    counts = PreshCounter()
    for doc in docs:
        n_tokens = len(doc)
        for w1 in doc:
            next_i = w1.i + 1
            if next_i >= n_tokens:
                # The last token in a document has no successor.
                continue
            w2 = doc[next_i]
            if w1.is_oov or w2.is_oov:
                continue
            counts.inc(encode_bigram(w1.orth, w2.orth, vocab_size), 1)
    counts.smooth()
    return counts
def estimate_ngram(ngram, bigrams, vocab_size):
    """Score a sequence of word IDs using bigram probabilities.

    The score is the product of ``bigrams.prob(key)`` over every adjacent
    pair in ``ngram``, multiplied by ``bigrams.total``. Keys are built as
    ``w1 * vocab_size + w2``. A sequence with fewer than two IDs has no
    pairs, so the result is simply ``1.0 * bigrams.total``.

    Args:
        ngram: Sequence (e.g. list) of integer word IDs.
        bigrams: Object exposing ``prob(key)`` and a ``total`` attribute.
        vocab_size: Base used to build the bigram keys.

    Returns:
        The product of the pair probabilities scaled by ``bigrams.total``.
    """
    prob = 1.0
    # Walk adjacent pairs directly instead of indexing by position.
    for w1, w2 in zip(ngram, ngram[1:]):
        prob *= bigrams.prob(w1 * vocab_size + w2)
    return prob * bigrams.total
def gen_docs(tokenizer, data_loc):
    """Lazily yield tokenized documents from a bz2-compressed JSON-lines file.

    Each non-blank line is parsed as JSON and the value under its
    ``'body'`` key is passed to ``tokenizer``.

    Args:
        tokenizer: Callable applied to each record's ``'body'`` value.
        data_loc: Path of the bz2-compressed input file.

    Yields:
        Whatever ``tokenizer`` returns for each record.
    """
    with bz2.BZ2File(data_loc) as file_:
        for line in file_:
            # Skip blank lines (e.g. a trailing newline) instead of
            # handing an empty string to the JSON parser.
            if not line.strip():
                continue
            data = ujson.loads(line)
            yield tokenizer(data[u'body'])
def main(data_loc):
    """Build bigram counts from a corpus and print two example estimates.

    Loads the English pipeline with the parser and tagger switched off,
    counts bigrams over the documents read from ``data_loc``, then prints
    the estimate for one phrase expected to be common and one expected
    to be rare.

    Args:
        data_loc: Path of the bz2-compressed JSON-lines corpus.
    """
    print("Load spacy")
    nlp = English(parser=False, tagger=False)
    # The string-store size is the base for encoding bigram keys.
    vocab_size = len(nlp.vocab.strings)

    print("setup docs")
    docs = gen_docs(nlp.tokenizer, data_loc)

    print("Count bigrams")
    bigram_probs = count_bigrams(docs, vocab_size)

    print("Get estimates")
    likely_ngram = [t.orth for t in nlp.tokenizer(u'State of the union')]
    unlikely_ngram = [
        t.orth
        for t in nlp.tokenizer(u'colorless green ideas sleep furiously')
    ]
    print(estimate_ngram(likely_ngram, bigram_probs, vocab_size))
    print(estimate_ngram(unlikely_ngram, bigram_probs, vocab_size))
# Script entry point: plac maps the command-line arguments onto main().
if __name__ == '__main__':
    plac.call(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment