Last active
March 6, 2016 02:45
-
-
Save mumbleskates/cb9c3433422bdea98f35 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
from collections import Counter, defaultdict | |
from functools import reduce | |
from itertools import tee | |
import re | |
# Floor probability substituted for ngrams never observed in training,
# so a single unseen ngram doesn't zero out a whole sentence score.
LOW_PROBABILITY = 1.0 / (1 << 20)


class LanguageModel(object):
    """A simple ngram language model.

    Counts ngram occurrences from training words and scores sentences by
    the product of conditional probabilities P(last word | (n-1)-word prefix).
    """

    def __init__(self, words=(), ngram_length=3):
        self.ngram_length = ngram_length
        # Maps (n-1)-word prefix tuple -> Counter of words that followed it.
        self.table = defaultdict(Counter)
        self.add_words(words)

    @staticmethod
    def words_to_ngrams(words, ngram_length):
        """Yield every consecutive ngram_length-tuple from the word iterable.

        Yields nothing when there are fewer than ngram_length words.
        """
        its = tee(words, ngram_length)
        # Stagger the tee'd iterators so its[i] is advanced i positions;
        # each pass of zip(*its[i:]) advances iterators i..n-1 by one.
        try:
            for i in range(1, ngram_length):
                next(zip(*its[i:]))
        except StopIteration:
            # Fewer than ngram_length words: no ngrams to yield.
            return
        yield from zip(*its)

    def add_ngram(self, gram):
        """Record one occurrence of gram (prefix tuple -> final word)."""
        self.table[gram[:-1]][gram[-1]] += 1

    def add_ngrams(self, ngrams):
        """Record every ngram in the iterable."""
        for gram in ngrams:
            self.add_ngram(gram)

    def add_words(self, words):
        """Add all ngrams derived from a word sequence to the model."""
        self.add_ngrams(LanguageModel.words_to_ngrams(
            words, self.ngram_length
        ))

    def ngram_count(self, gram):
        """Number of times gram was observed in training.

        Uses .get() instead of indexing so that querying an unseen prefix
        does not insert an empty Counter into the defaultdict (the original
        mutated the table on every read).
        """
        counter = self.table.get(gram[:-1])
        return 0 if counter is None else counter[gram[-1]]

    def leading_ngram_count(self, leading_ngram):
        """Total occurrences of the (n-1)-word prefix across all ngrams."""
        # .get() avoids inserting an empty Counter for unseen prefixes.
        counter = self.table.get(leading_ngram)
        return 0 if counter is None else sum(counter.values())

    def normalized_ngram_count(self, gram):
        """Conditional probability of gram's last word given its prefix.

        Returns 0 when the prefix was never observed.
        """
        frequency = self.ngram_count(gram)
        total = self.leading_ngram_count(gram[:-1])
        return 0 if total == 0 else frequency / total

    def sentence_probability(self, words):
        """Product of conditional ngram probabilities over the sentence.

        Unseen ngrams contribute LOW_PROBABILITY instead of zero. Returns
        1.0 for sentences shorter than ngram_length (no ngrams).
        """
        return reduce(
            # The generator below already substitutes LOW_PROBABILITY, so
            # the lambda is a plain product (original re-applied the floor
            # redundantly).
            lambda acc, p: acc * p,
            (
                # BUG FIX: the original called normalized_ngram_count(*gram),
                # unpacking the tuple into separate positional arguments and
                # raising TypeError; the method takes the tuple itself.
                self.normalized_ngram_count(gram) or LOW_PROBABILITY
                for gram in LanguageModel.words_to_ngrams(
                    words, self.ngram_length
                )
            ),
            1.0
        )
def corpora_from_file(filename):
    """Read *filename* as UTF-8 text and split it into corpus strings.

    Corpora are delimited by runs of 30 or more asterisks.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        text = fp.read()
    return re.split(r"\*{30,}", text)
def corpus_to_words(corpus, splitter):
    """Tokenize a corpus string into a sequence of words via *splitter*."""
    words = splitter(corpus)
    return words
def load_corpora(filename, splitter=str.split):
    """Build one LanguageModel per corpus found in *filename*.

    Each corpus string is tokenized with *splitter* (whitespace split
    by default) before being fed to a model.
    """
    models = []
    for corpus in corpora_from_file(filename):
        models.append(LanguageModel(splitter(corpus)))
    return models
if __name__ == "__main__":
    # Score a sample sentence against every corpus model and print
    # the resulting probabilities side by side.
    models = load_corpora("corpora.txt")
    query = str.split("how do you do")
    print([model.sentence_probability(query) for model in models])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment