Skip to content

Instantly share code, notes, and snippets.

@jxnl
Created September 23, 2014 00:05
Show Gist options
  • Save jxnl/c062b183d603e84fd5c2 to your computer and use it in GitHub Desktop.
Save jxnl/c062b183d603e84fd5c2 to your computer and use it in GitHub Desktop.
Generates Parody Text.
"""
This module contains a markov chain class that can be used to generate
a pseudo-random list of elements from a selected corpus. It does so
by collecting all existing bigrams, i.e. (element_i, element_i+1), of the
selected corpus and defining a markov chain with transition probabilities
proportional to the relative frequency of the bigrams.
Author: Jason Liu
Date: May 24th, 2014
Language
--------
While a Markov chain can generate elements of any kind (weather, music, etc.),
the docstrings here pertain to generating text from a corpus. We will use
'word-node' or 'primary-word' to discuss a state, and 'adjacent-node' or
'adjacent-word' to discuss the edges for a certain 'node'.
The MIT License (MIT)
Copyright (c) 2014
"""
import random
from collections import defaultdict
import regex as re
class WordNode(object):
    """Empty placeholder class; it defines no attributes or methods."""
class Markov(object):
    """
    Markov Chain Pseudo Text Generator
    ----------------------------------
    Text generator that follows the Markov property: the next word is
    chosen by looking at the current word only.

    Bigrams are collected from text into a bag of adjacent-word counts
    for each word, e.g. {'eat': {'eggs': 2, 'spam': 1}, ...}.
    Calling create_chain() turns that bag into the Markov chain: for each
    word, a list of (cumulative probability, adjacent word, relative
    probability) tuples, e.g.
    {'eat': [(0.666, 'eggs', 0.666), (1.0, 'spam', 0.333)]}.

    Attributes
    ----------
    Markov.bigrams:
        - Mapping word-node -> {adjacent word: occurrence count}.
    Markov.chain:
        - Mapping word-node -> list of edges with transition
          probabilities. It is only refreshed by create_chain() or
          update_chain(); collecting more bigrams does not update it.
    """

    # Safety limits so get_text() always terminates, even on a chain that
    # has no dead ends or when a seed can never reach the minimum length.
    _DEFAULT_MAX_WORDS = 1000
    _MAX_RESTARTS = 100

    def __init__(self):
        """Create a generator with no bigrams and an empty chain."""
        self.chain = {}
        # defaultdict requires a *callable* factory; handing it the
        # instance defaultdict(int) raises TypeError at construction.
        self.bigrams = defaultdict(lambda: defaultdict(int))

    def __str__(self):
        """Return a freshly generated block of text ('' if no chain yet)."""
        if not self.chain:
            return ''
        return self.get_text()

    def collect_bigrams_from_file(self, filename):
        """Collect all bigrams into Markov.bigrams from a text file.

        Bigrams are collected line by line, so a pair never spans a line
        break.

        Args:
            filename (str): name of target file
        """
        with open(filename) as source:
            for line in source:
                self.collect_bigrams_from_string(line)

    def collect_bigrams_from_string(self, string):
        """Collect all bigrams into Markov.bigrams from a string.

        The string is split on whitespace and every token is passed
        through clean() before pairing.

        Args:
            string (str): target string
        """
        # Tokens that clean() reduces to '' (e.g. "()") are dropped: an
        # empty word-node would read as "end of text" during generation.
        words = [w for w in (clean(tok) for tok in string.split()) if w]
        for word, adj_word in zip(words, words[1:]):
            self.update(word, adj_word)

    def update(self, word, adj_word):
        """Record one occurrence of the bigram (word, adj_word).

        Args:
            word (str): Primary word (word-node)
            adj_word (str): Adjacent word
        """
        self.bigrams[word][adj_word] += 1

    def count_words(self, word):
        """Count the bigram occurrences recorded for a primary word.

        Args:
            word (str): Primary word

        Returns:
            int: total count over all adjacent words, 0 if word is unknown.
        """
        # .get() instead of indexing: indexing a defaultdict would insert
        # an empty entry for every unknown word that is merely queried.
        return sum(self.bigrams.get(word, {}).values())

    def create_chain(self):
        """Create a complete Markov.chain from Markov.bigrams."""
        # Snapshot the keys so the loop is independent of dict mutation.
        for word in list(self.bigrams):
            self.update_chain(word)

    def update_chain(self, word):
        """Rebuild the edges of a single word-node from its bigram counts.

        Does nothing when no bigrams are recorded for the word.

        Args:
            word (str): word-node to update
        """
        edges = self.bigrams.get(word)
        if not edges:
            return
        total = float(sum(edges.values()))
        running = 0
        links = []
        for adj_word, count in edges.items():
            # Accumulate integer counts rather than float probabilities so
            # the last cumulative value is exactly 1.0 and random.random()
            # can never fall past the final edge.
            running += count
            links.append((running / total, adj_word, count / total))
        self.chain[word] = links

    def get_next_words(self, cur_word):
        """Step along the markov chain to produce a new word.

        Args:
            cur_word (str): word-node from which the step is taken

        Returns:
            str or None: the adjacent word chosen according to the edge
            probabilities, or None when cur_word has no outgoing edges.
        """
        options = self.chain.get(cur_word)
        if not options:
            return None
        choice = random.random()
        for cumprob, adj_word, _ in options:
            if choice < cumprob:
                return adj_word
        # Only reachable when the chain was filled in by hand with
        # cumulative probabilities that stop short of 1.0.
        return options[-1][1]

    def get_text(self, minl=10, seed=None, maxw=None):
        """Random walk the markov chain to produce pseudo random text.

        A walk starts at the seed (or a random word-node) and follows the
        chain until it reaches a word with no outgoing edges or the
        maximum length. Walks shorter than minl are discarded and
        retried; after a fixed number of retries the longest walk seen is
        returned so the call always terminates.

        Args:
            minl (int): minimum length (in words) of a block of text
            seed (str, optional): starting word of the text block,
                random otherwise
            maxw (int, optional): maximum length (in words) of a block of
                text; defaults to max(minl, 1000)

        Returns:
            str: the words joined by single spaces, with the first word
            passed through clean(first=True) and the last through
            clean(last=True).

        Raises:
            IndexError: if no seed is given and the chain is empty.
        """
        # list() because dict views are not indexable by random.choice().
        starts = list(self.chain)
        if not seed and not starts:
            raise IndexError('cannot choose a starting word from an empty chain')
        if maxw is None:
            limit = max(minl, self._DEFAULT_MAX_WORDS)
        else:
            limit = maxw
        target = min(minl, limit)

        best = []
        for _ in range(self._MAX_RESTARTS):
            words = self._walk(seed or random.choice(starts), limit)
            if len(words) > len(best):
                best = words
            if len(best) >= target:
                break

        # Presentation clean-up happens only after the walk: looking up
        # the capitalized first word in the chain would miss its node.
        best[0] = clean(best[0], first=True)
        best[-1] = clean(best[-1], last=True)
        return ' '.join(best)

    def _walk(self, start, limit):
        """Follow the chain from start; return the raw words visited."""
        words = [start]
        while len(words) < limit:
            new_word = self.get_next_words(words[-1])
            if not new_word:
                break
            words.append(new_word)
        return words
def clean(string, first=None, last=None):
    """
    Clean up a single word-node.

    Exactly one treatment is applied, checked in this order:

    * ``first`` is truthy - capitalize the word (opening word of a text).
    * ``last`` is truthy  - replace commas with periods (closing word).
    * otherwise           - strip square brackets and parentheses.

    Args:
        string (str): word to clean
        first (bool, optional): treat the word as the first of a text block
        last (bool, optional): treat the word as the last of a text block

    Returns:
        str: the cleaned word
    """
    if first:
        # str.title() restarts capitalization after every non-letter, so
        # "don't" came out as "Don'T". capitalize() upper-cases only the
        # first character and, like title(), lower-cases the rest.
        return string.capitalize()
    if last:
        return re.sub(r',', '.', string)
    return re.sub(r'[\[\]\(\)]', '', string)
@jxnl
Copy link
Author

jxnl commented Nov 30, 2014

do not use this code

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment