Created
June 25, 2013 01:04
-
-
Save shaldengeki/5855100 to your computer and use it in GitHub Desktop.
Markov post model. Horribly unoptimized, but functional.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import re | |
import random | |
# Precompiled punctuation-cleanup patterns. Currently unused: the re.sub
# calls that consumed them in tokenize() are commented out, but the patterns
# are kept for when that normalization step is re-enabled.
APOSTROPHE_REGEX = re.compile("'")
NON_ALPHANUMERIC_REGEX = re.compile('[^a-zA-Z0-9]+')
def strip_tags(text, valid_tags):
    # Strip all markup from ``text`` except tags whose names appear in
    # ``valid_tags`` (any membership-testable container: dict, set, list).
    # Returns the cleaned markup as a unicode string.
    # NOTE(review): Python 2 code — relies on the ``unicode`` builtin and the
    # deprecated camelCase BeautifulSoup API (findChildren / replaceWith).
    text = bs4.BeautifulSoup(text)
    # Descend through single-child wrapper elements to reach the innermost
    # meaningful node before scanning for tags.
    while len(text.contents) == 1:
        text = text.contents[0]
        if isinstance(text, bs4.NavigableString):
            # Descended all the way to bare text; step back up to its
            # enclosing element and stop descending.
            text = text.parent
            break
    for tag in text.findChildren(True):
        if tag.name in valid_tags:
            # Whitelisted tag: flatten its contents into one string,
            # recursively stripping any nested markup, then replace the
            # tag node with that string.
            s = ""
            for c in tag.contents:
                if not isinstance(c, bs4.NavigableString):
                    c = strip_tags(unicode(c), valid_tags)
                s += unicode(c)
            tag.replaceWith(s)
        else:
            # Non-whitelisted tag: remove it from the tree entirely
            # (extract() takes the tag's subtree with it).
            tag.extract()
    return unicode(text)
def tokenize(text, valid_tags=False):
    """Strip disallowed markup from ``text`` and split it into word tokens.

    ``valid_tags`` may be:
      * falsy (the default) -- use the default whitelist of b/i/u tags;
      * a list of tag names -- converted to a membership-testable dict;
      * any other container (dict/set) -- used as-is for membership tests.

    Returns a list of whitespace-separated tokens.
    """
    if not valid_tags:
        valid_tags = {'b': 1, 'i': 1, 'u': 1}
    elif isinstance(valid_tags, list):
        valid_tags = dict.fromkeys(valid_tags, 1)
    # BUG FIX: the original collapsed any truthy non-list value (e.g. a dict
    # of tag names) to [], which silently stripped *every* tag. Dicts and
    # sets already support the ``in`` tests strip_tags performs, so they are
    # now passed through unchanged.
    text = strip_tags(text, valid_tags)
    # Let BeautifulSoup decode entities and flatten the remaining markup to
    # plain text before splitting on whitespace.
    text = bs4.BeautifulSoup(text).text
    return text.split()
class MarkovModel(object):
    """First-order Markov model over word tokens extracted from documents.

    Pipeline: store()/add() collect raw documents, tokenize() builds raw
    bigram counts, normalize() converts those counts into cumulative
    frequency tables, and phrases() samples sentences from the tables.
    The empty string '' marks both sentence start and sentence end.
    """

    def __init__(self, min_freq=0.01):
        # min_freq: minimum relative frequency a follower word must reach to
        # survive normalization (rarer followers are pruned).
        self.docs = []
        self.min_freq = float(min_freq)
        # _tokens: {word: {follower: count}}
        # _freqs:  {word: [(follower, cumulative_frequency), ...]}
        # _model is never populated here; kept for interface compatibility.
        self._tokens = self._freqs = self._model = None

    def reset(self):
        # Drop all stored documents and every piece of derived state.
        self.docs = []
        # BUG FIX: the original left _freqs populated, so a later phrases()
        # call would sample from a model built on discarded documents.
        self._tokens = self._freqs = self._model = None
        return self

    def store(self, docs):
        # Replace the stored documents with a copy of ``docs``.
        # BUG FIX: the original assigned a generator expression, which broke
        # a subsequent add() (generators have no append) and could only be
        # iterated once; materialize a real list instead.
        self.docs = list(docs)
        return self

    def add(self, doc):
        # Append a single document to the store.
        self.docs.append(doc)

    def tokenize(self, docs=None):
        # Build raw bigram counts from the currently stored documents.
        # NOTE: ``docs`` is accepted for interface compatibility but is not
        # used (the original implementation ignored it as well).
        self._tokens = {}
        for doc in self.docs:
            tokens = tokenize(doc)  # module-level tokenizer, not this method
            if not tokens:
                continue
            # '' is the sentence-start marker preceding the first word.
            prev = ''
            for word in tokens:
                followers = self._tokens.setdefault(prev, {})
                followers[word] = followers.get(word, 0) + 1
                prev = word
            # Record the sentence-end marker after the final word.
            followers = self._tokens.setdefault(tokens[-1], {})
            followers[''] = followers.get('', 0) + 1
        return self

    def normalize(self, min_freq=None):
        # Convert raw counts into per-word cumulative frequency tables,
        # pruning followers whose count falls under min_freq of the total.
        if min_freq is None:
            min_freq = self.min_freq
        if self._tokens is None:
            self.tokenize()
        self._freqs = {}
        for word, counts in self._tokens.items():
            min_count = int(min_freq * sum(counts.values()))
            kept = [(w, c) for (w, c) in counts.items() if c >= min_count]
            # Re-total over the survivors so the kept entries renormalize.
            total = float(sum(c for (_, c) in kept))
            self._freqs[word] = []
            # BUG FIX: guard the empty-table case -- the original divided by
            # zero whenever min_freq pruned every follower of a word.
            if total == 0:
                continue
            running = 0.0
            for w, c in kept:
                running += c
                # Cumulative frequencies; the final entry is always 1.0.
                self._freqs[word].append((w, running / total))
        return self

    def phrases(self, num=1, word=None):
        # Print ``num`` sampled sentences, optionally seeded with ``word``.
        if self._freqs is None:
            self.normalize()
        random.seed()
        for _ in range(num):
            current = word
            # No seed word: sample a sentence-start token from the '' table.
            while current is None:
                rand = random.random()
                for (candidate, cum_freq) in self._freqs['']:
                    if rand <= cum_freq:
                        current = candidate
                        break
            sentence = [current]
            # Walk the chain until the sentence-end marker '' is drawn.
            # BUG FIX: compare with != rather than ``is not`` -- identity
            # comparison against a string literal is unreliable and only
            # happened to work via CPython string interning.
            while current != '':
                if current not in self._freqs:
                    break
                rand = random.random()
                for (candidate, cum_freq) in self._freqs[current]:
                    if rand <= cum_freq:
                        sentence.append(candidate)
                        current = candidate
                        break
            print(' '.join(sentence).strip())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment