Created
June 17, 2016 02:36
-
-
Save jklydev/76e2766d7506bad5ce5d83008d07eb43 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
class Ngram:
    """Word-level n-gram (Markov chain) text generator.

    The source document is split on whitespace into words. Every run of
    ``n`` consecutive words contributes one entry to ``db``: the first
    ``n - 1`` words (as a tuple) map to a list of the words seen right
    after them. Repeated followers are kept in that list, so a uniform
    pick from it is weighted by how often each follower occurred.
    """

    def __init__(self, doc, n=2):
        """Read *doc* and build the transition table.

        doc: path of the text file to read.
        n: number of words per n-gram; must be at least 1.

        Raises ValueError if ``n`` is smaller than 1 or the document
        contains fewer than ``n`` words.
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        self.N = n
        self.wordlist(doc)
        self.make_db()

    def wordlist(self, doc):
        """Load the file at path *doc* into ``self.words`` (whitespace split)."""
        with open(doc, 'r') as f:
            self.words = f.read().split()

    def grams(self):
        """Yield ``(prefix, follower)`` for every n-gram in ``self.words``.

        ``prefix`` is a tuple of ``N - 1`` consecutive words and
        ``follower`` is the word immediately after them.

        Raises ValueError (on first iteration) when the document has
        fewer than ``N`` words.
        """
        if len(self.words) < self.N:
            # Raising a bare string is itself a TypeError on modern Python;
            # use a real exception that carries the same message.
            raise ValueError("Document too short")
        order = self.N - 1
        for start in range(len(self.words) - order):
            prefix = tuple(self.words[start:start + order])
            yield prefix, self.words[start + order]

    def make_db(self):
        """Build ``self.db``, mapping each prefix tuple to its followers."""
        self.db = {}
        for prefix, follower in self.grams():
            self.db.setdefault(prefix, []).append(follower)

    def make(self, length=20):
        """Generate text by walking the transition table.

        Starts from a randomly chosen prefix and appends up to *length*
        further words, each picked at random from the followers of the
        current last ``N - 1`` words. Generation stops early if the
        current prefix has no recorded follower (this happens when the
        walk reaches the final words of the source document).

        Returns the generated words joined by single spaces.
        """
        order = self.N - 1
        # list(...) so random.choice can index the keys on Python 3 too.
        sentence = list(random.choice(list(self.db)))
        for _ in range(length):
            # Slice from an explicit start index: a negative slice of
            # ``-0`` (when N == 1) would select the whole sentence instead
            # of the empty prefix.
            state = tuple(sentence[len(sentence) - order:])
            followers = self.db.get(state)
            if not followers:
                break
            sentence.append(random.choice(followers))
        return ' '.join(sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment