Last active
August 29, 2015 13:57
-
-
Save JnBrymn/9885157 to your computer and use it in GitHub Desktop.
Simple Markov Model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
import random | |
class MarkovModel(object):
    """
    Order-``n`` Markov model built from an iterable of tokens.

    The model maps a key (a tuple of up to ``n`` preceding tokens) to the
    tokens that were observed to follow it, together with their counts.
    ``None`` is a special token that delimits phrases: it resets the key to
    ``(None,)`` while training and ends the phrase while sampling.
    """

    @classmethod
    def _tokenizer(cls, text, token_delim):
        """Yield the tokens of ``text``, one phrase per line.

        Each line is split on ``token_delim``; a ``None`` is yielded after
        every line to mark the end of that phrase.
        """
        for phrase in text.split("\n"):
            for token in phrase.split(token_delim):
                yield token
            yield None

    @classmethod
    def fromText(cls, text, token_delim=".", n=1):
        """Build a model of order ``n`` from a newline-separated string.

        Uses ``cls`` rather than a hard-coded class name so that subclasses
        get an instance of their own type.
        """
        return cls(cls._tokenizer(text, token_delim), n)

    def __init__(self, token_iterator, n=1):
        """Train the model on ``token_iterator``.

        ``token_iterator`` is any iterable of hashable tokens (``None``
        marks a phrase boundary); ``n`` is the order of the model, i.e. the
        maximum number of preceding tokens used as the lookup key.
        """
        self.n = n
        self.model_dict = defaultdict(
            lambda: {"count": 0, "tokens_and_counts": defaultdict(int)}
        )
        key = (None,)  # this is a tuple
        for token in token_iterator:
            sub_dict = self.model_dict[key]
            sub_dict["count"] += 1
            sub_dict["tokens_and_counts"][token] += 1
            key = self._shift_key(key, token)
        # After training, indexing an unseen key yields None instead of a
        # fresh empty entry. Note that a defaultdict still *stores* that None
        # when indexed with [], so the methods below use .get() and tolerate
        # None placeholders.
        self.model_dict.default_factory = lambda: None

    def __repr__(self):
        """Return a tab-indented listing of every key and its successors."""
        lines = []
        for key, counts in self.model_dict.items():
            if counts is None:
                # Placeholder left by an external model_dict[unseen_key].
                continue
            lines.append("{0}\tcount:{1}\n".format(key, counts["count"]))
            for token, count in counts["tokens_and_counts"].items():
                lines.append("\t{0}\tcount:{1}\n".format(token, count))
        return "".join(lines)

    def generateSample(self, max_tokens=100):
        """Generate one random phrase as a list of tokens.

        Sampling starts from the phrase-start key ``(None,)`` and stops when
        a ``None`` token is drawn (end of phrase), when the current key was
        never seen in training (dead end), or after ``max_tokens`` tokens.
        """
        key = (None,)
        tokens = []
        for _ in range(max_tokens):
            # .get() so that a dead end does not insert a None entry.
            sub_dict = self.model_dict.get(key)
            if sub_dict is None:
                return tokens  # here we have reached a dead end
            # Draw from 1..count inclusive: randint includes both endpoints,
            # so starting at 0 would give the first successor one extra
            # chance and bias the sample.
            until = random.randint(1, sub_dict["count"])
            for token, count in sub_dict["tokens_and_counts"].items():
                until -= count
                if until <= 0:
                    if token is None:
                        return tokens  # end of a phrase
                    tokens.append(token)
                    key = self._shift_key(key, token)
                    break
        return tokens  # here we have reached the max_tokens

    def _shift_key(self, key, token):
        """Return the key that follows ``key`` once ``token`` is consumed.

        A ``None`` token resets the key to ``(None,)``. Otherwise the token
        is appended and, if the key then exceeds ``n`` entries, the oldest
        entry is dropped.
        """
        if token is None:
            return (token,)
        shifted = key + (token,)
        if len(shifted) > self.n:
            shifted = shifted[1:]
        return shifted
If you want to make fun of people's tweets, here's a good way to do it!
import tweepy
import os
# Twitter credentials are read from environment variables; a variable that
# is not set is passed through as None.
auth = tweepy.OAuthHandler(
    os.environ.get("TWITTER_CONSUMER_KEY"),
    os.environ.get("TWITTER_CONSUMER_SECRET"),
)
auth.set_access_token(
    os.environ.get("TWITTER_BOT_TOKEN"),
    os.environ.get("TWITTER_BOT_SECRET"),
)
# Module-level API client used by make_fun_of below.
t = tweepy.API(auth)
def make_fun_of(screen_name, n=1):
    """Build a MarkovModel of order ``n`` from a user's timeline.

    Requests the timeline of ``screen_name`` (with ``count=200``) through
    the module-level API client ``t``. The text of each returned item is
    split on single spaces into tokens, and a ``None`` is added after each
    item so that every one is treated as a separate phrase.
    """
    timeline = t.user_timeline(screen_name=screen_name, count=200)
    words = []
    for status in timeline:
        words += status.text.split(" ")
        words += [None]  # phrase delimiter between items
    return MarkovModel(words, n=n)
Use it this way:
# Build a model from the account's timeline and print one generated phrase.
bbombgardener = make_fun_of("bbombgardener")
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(" ".join(bbombgardener.generateSample()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Works like this:
You can implement higher order models as well
And you can do words (or any token you wish)
Numbers work too.
None is considered a special "phrase" delimiter.
The pretty print is also not bad: