Last active
December 21, 2015 13:48
-
-
Save justinvanwinkle/6314903 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import random | |
import sys | |
import re | |
from math import floor | |
from math import ceil | |
from collections import defaultdict | |
# Token pattern, tried left to right at each position:
#   1. a word of up to 20 word-characters, an apostrophe, then more
#      word-characters (contractions / possessives),
#   2. a plain word of up to 20 word-characters,
#   3. a single period or comma.
# Anything else (whitespace, other punctuation) is skipped by findall.
_TOKEN_PATTERN = r'((?:\w{1,20}\'\w+)|(?:\w{1,20})|(?:[.,]))'
_tokenize = re.compile(_TOKEN_PATTERN)
def ngrams(tokens):
    """Yield every contiguous run of 2 to 6 tokens as a tuple.

    Runs are produced in order of start position and, for each start
    position, from shortest to longest. Runs are clipped at the end of
    ``tokens``, so an input with fewer than two tokens yields nothing.

    Args:
        tokens: A sliceable sequence of tokens (for example a list of
            strings).

    Yields:
        Tuples of 2 to 6 consecutive tokens.
    """
    # No diagnostic printing here: stdout carries the generated text, so
    # echoing the tokens and every n-gram would corrupt the program output.
    n_tokens = len(tokens)
    # ``range`` (rather than ``xrange``) keeps this runnable on Python 2 and 3.
    for start in range(n_tokens):
        longest_end = min(n_tokens, start + 6)
        for end in range(start + 2, longest_end + 1):
            yield tuple(tokens[start:end])
def tokenize(s):
    """Return the list of tokens that the module's ``_tokenize`` pattern finds in ``s``."""
    # The pattern is wrapped in a single capturing group, so group(1) of
    # each match is the token text.
    return [match.group(1) for match in _tokenize.finditer(s)]
# Where to cut an n-gram of a given length into (context, continuation):
# the context takes the first ceil(length / 2) tokens.
# Yields {6: 3, 5: 3, 4: 2, 3: 2, 2: 1}.
splits = {length: (length + 1) // 2 for length in range(6, 1, -1)}
class Markov(object):
    """Variable-order Markov text model over token tuples.

    ``grams`` maps a context tuple (the leading part of an observed n-gram)
    to the list of continuation tuples seen after it. A continuation seen
    several times is stored several times.
    """

    def __init__(self):
        self.grams = defaultdict(list)

    def add_text(self, s):
        """Tokenize ``s`` and record every n-gram it contains.

        Each n-gram is cut at ``splits[len(gram)]``; the leading part
        becomes the context key and the rest is appended to that key's
        continuation list.
        """
        for gram in ngrams(tokenize(s)):
            split = splits[len(gram)]
            self.grams[gram[:split]].append(gram[split:])

    def get_next(self, current):
        """Pick a continuation for the token tuple ``current``.

        Contexts are tried from longest to shortest: ``current`` itself,
        then ``current`` without its oldest token, then without its two
        oldest tokens. For each known context, its continuations are
        visited in stored order and each one is returned with probability
        0.8. If none is accepted this way, a uniform random choice is made
        among all continuations collected from the known contexts.

        Args:
            current: Tuple of the most recent tokens (must be hashable).

        Returns:
            A continuation tuple, or ``None`` when no context is known.
        """
        possibilities = []
        for dropped in range(3):
            # Back off by discarding the OLDEST tokens so the context always
            # ends at the latest token; trimming from the right would pick a
            # continuation that follows older text and repeat tokens.
            context = current[dropped:]
            if context not in self.grams:
                continue
            continuations = self.grams[context]
            possibilities.extend(continuations)
            for continuation in continuations:
                if random.random() > .2:
                    return continuation
        if not possibilities:
            return None
        return random.choice(possibilities)

    def make_ngram(self, seed=None, max_len=20):
        """Generate a list of tokens, stopping before the first ``'.'``.

        Args:
            seed: Optional iterable of tokens to start from. When ``None``
                a random known context is used as the start.
            max_len: Maximum number of tokens returned.

        Returns:
            A list of at most ``max_len`` tokens containing no ``'.'``.
            An empty list is returned when no seed is given and the model
            holds no data.
        """
        if seed is None:
            if not self.grams:
                # Nothing learned yet, so there is nothing to choose from.
                return []
            # list(...) because dict key views are not indexable on Python 3.
            part = random.choice(list(self.grams))
        else:
            part = tuple(seed)

        poem = []
        # Stop once max_len tokens are collected: the result is truncated to
        # max_len anyway, and without this bound a corpus whose chain cycles
        # without ever reaching '.' would loop forever.
        while '.' not in part and len(poem) < max_len:
            poem.extend(part)
            part = self.get_next(tuple(poem[-3:]))
            if part is None:
                break
        if part:
            poem.extend(part)
        if '.' in poem:
            return poem[:poem.index('.')][:max_len]
        return poem[:max_len]
def main():
    """Train a model on stdin, one line at a time, and print one generated line."""
    model = Markov()
    for line in sys.stdin.read().splitlines():
        model.add_text(line)
    words = model.make_ngram()
    # Single-argument call form prints identically under Python 2.
    print(' '.join(words))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment