Created
August 23, 2013 02:21
-
-
Save markrwilliams/6314922 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import random | |
from itertools import tee, izip_longest | |
from collections import defaultdict | |
class Occurrences(object):
    """Occurrence statistics for one word of the chain.

    Tracks how many times the word itself was counted and, for every word
    recorded as following it, how many times that happened.
    """

    def __init__(self, personal_count=0, successor_count=None):
        """Create the statistics, optionally from previously saved values.

        personal_count  -- number of times the word was counted.
        successor_count -- optional mapping of following word -> count;
                           its entries are copied into a fresh
                           ``defaultdict(int)``.
        """
        # Honour the argument: rebuilding from ``asdict()`` output must
        # restore the count rather than resetting it to zero.
        self.personal_count = personal_count
        self.successor_count = defaultdict(int)
        if successor_count:
            self.successor_count.update(successor_count)

    @property
    def nextrandom(self):
        """Pick a following word at random, weighted by its count.

        A threshold is drawn uniformly from ``[0, personal_count]`` and the
        successors are walked until their cumulative count exceeds it.
        Returns ``None`` when no successor is selected, i.e. when there are
        no successors or the threshold lands beyond the sum of all
        successor counts (personal_count may be larger than that sum).
        """
        threshold = random.uniform(0, self.personal_count)
        cumulative = 0
        # ``items()`` works on both Python 2 and Python 3.
        for word, count in self.successor_count.items():
            if cumulative + count > threshold:
                return word
            cumulative += count
        return None

    def asdict(self):
        """Return the state as a plain dict accepted by ``Occurrences(**d)``."""
        return {'personal_count': self.personal_count,
                # Copy into a plain dict so the snapshot does not alias
                # (or auto-insert keys into) the live counter.
                'successor_count': dict(self.successor_count)}

    def __repr__(self):
        return ('Occurrences(personal_count={}, '
                'successor_count={})'.format(self.personal_count,
                                             self.successor_count))
class Markov(object):
    """Word-level Markov chain that learns from text and generates sentences.

    ``probabilities`` maps each word to its ``Occurrences`` statistics and
    ``start_words`` is the set of words that began a learned sentence.
    """

    def __init__(self, probabilities=None, start_words=None):
        """Create an empty model, or restore one from ``asdict()`` output.

        probabilities -- optional mapping of word -> dict of keyword
                         arguments for ``Occurrences``.
        start_words   -- optional iterable of sentence-opening words.
        """
        self.probabilities = defaultdict(Occurrences)
        self.start_words = set()
        if probabilities:
            # ``items()`` works on both Python 2 and Python 3.
            for word, data in probabilities.items():
                self.probabilities[word] = Occurrences(**data)
        if start_words:
            self.start_words.update(start_words)

    def update(self, sentence):
        """Learn from ``sentence``, a whitespace-separated string of words.

        The first word is recorded as a start word; every word has its
        count incremented and, except for the last, the count of the word
        that follows it.  An empty or whitespace-only sentence is ignored.
        """
        words = sentence.split()
        if not words:
            # Nothing to learn; avoids failing on the missing first word.
            return
        self.start_words.add(words[0])
        # Pair each word with its follower; the last word is paired with None.
        followers = words[1:] + [None]
        for cur, follow in zip(words, followers):
            occurrences = self.probabilities[cur]
            occurrences.personal_count += 1
            if follow is not None:
                occurrences.successor_count[follow] += 1

    def generate(self):
        """Return a randomly generated sentence as a single string.

        Returns ``"no data yet :("`` when no start word has been learned.
        Otherwise starts from a random start word and appends up to
        ``length`` randomly chosen successors, stopping early when a word
        yields no successor.
        """
        firsts = list(self.start_words)
        if not firsts:
            return "no data yet :("
        # Clamp the upper bound: randint raises ValueError on an empty
        # range, which happened for models with fewer than four words.
        length = random.randint(2, max(2, len(self.probabilities) // 2))
        word = random.choice(firsts)
        sentence = [word]
        for _ in range(length):
            # ``get`` avoids inserting empty entries into the defaultdict
            # as a side effect of generating text.
            occurrences = self.probabilities.get(word)
            if occurrences is None:
                break
            word = occurrences.nextrandom
            if word is None:
                break
            sentence.append(word)
        return ' '.join(sentence)

    def asdict(self):
        """Return the model as plain data accepted by ``Markov(**d)``."""
        return {'probabilities': {w: c.asdict()
                                  for w, c in self.probabilities.items()},
                'start_words': list(self.start_words)}
if __name__ == '__main__':
    # Demo: learn from the text file named on the command line, print one
    # generated sentence, then dump the model as JSON and rebuild a model
    # from that dump.
    import json
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: {} TEXT_FILE'.format(sys.argv[0]))

    m = Markov()
    with open(sys.argv[1]) as f:
        m.update(f.read())
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(m.generate())
    saved = m.asdict()
    print(json.dumps(saved))
    print(Markov(**saved))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment