Created September 23, 2014 00:05
Generates Parody Text.
| """ | |
| This module contains a markov chain class that can be used to generate | |
| a psuedo random list of elements from a selected corpus. It does so | |
| by collecting all existing bigrams e.i. (element_i, element_i+1) of the | |
| selected corpus and defining a markov chain with transition probabilities | |
| proportional to the relative frequency of the bigrams. | |
| Author: Jason Liu | |
| Date: May 24th, 2014 | |
| Language | |
| -------- | |
| While a markov chain can generate elements of any kind, weather, music, etc. | |
| The docstrings will pertain to generating text from a corpus. We will use | |
| 'word-node' or 'primary-word' to discuss a state, and 'adjacent-node' or | |
| 'adjacent-word' to discuss the edges for a certain 'node'. | |
| The MIT License (MIT) | |
| Copyright (c) 2014 | |
| """ | |
import random
from collections import defaultdict
import re


class WordNode(object):
    # Placeholder class; not used by Markov below.
    pass


class Markov(object):
    """
    Markov Chain Pseudo Text Generator
    ----------------------------------
    This class is a text generator that follows the Markov property.
    It collects bigrams from text and builds a bag of adjacent words
    for each word in the text, e.g. {'eat': ['eggs', 'eggs', 'spam'], ...}

    By calling the create_chain() method, it collects the bag and produces
    the Markov chain. The chain is a set of possible destinations for
    each word. Each destination contains a cumulative probability, the word,
    and a relative probability, e.g. {'eat': [(0.666, 'eggs', 0.666),
                                              (1.0, 'spam', 0.333)]}

    Attributes
    ----------
    Markov.bigrams:
        - A bag of adjacent words (with counts) for each word-node.
    Markov.chain:
        - A list of (cumulative probability, word, probability) edges
          for each word-node.
    """

    def __init__(self):
        self.chain = {}
        # defaultdict needs a callable factory, hence the lambda.
        self.bigrams = defaultdict(lambda: defaultdict(int))

    def __str__(self):
        return self.get_text()

    def collect_bigrams_from_file(self, filename):
        """Collect all bigrams into Markov.bigrams from a text file

        attributes:
            filename (str): name of target file
        """
        with open(filename) as infile:
            for line in infile:
                self.collect_bigrams_from_string(line)

    def collect_bigrams_from_string(self, string):
        """Collect all bigrams into Markov.bigrams from a string

        attributes:
            string (str): target string
        """
        text = [clean(word) for word in string.split()]
        for word, adj_word in zip(text, text[1:]):
            self.update(word, adj_word)

    def update(self, word, adj_word):
        """Update the collection of bigrams with a single bigram

        attributes:
            word (str): Primary word (word-node)
            adj_word (str): Adjacent word
        """
        self.bigrams[word][adj_word] += 1

    def count_words(self, word):
        """Count the elements affiliated with a primary word

        attributes:
            word (str): Primary word
        """
        return sum(self.bigrams[word].values())

    def create_chain(self):
        """
        Create a complete Markov.chain from Markov.bigrams
        """
        for word in self.bigrams:
            self.update_chain(word)

    def update_chain(self, word):
        """Update a single word-node on the Markov chain and rebalance
        probabilities for each edge.

        attributes:
            word (str): word-node to update
        """
        if word in self.bigrams:
            total = self.count_words(word)
            cumprob = 0.0
            self.chain[word] = [()] * len(self.bigrams[word])
            for i, edge in enumerate(self.bigrams[word]):
                prob = self.bigrams[word][edge] / float(total)
                cumprob += prob
                self.chain[word][i] = (cumprob, edge, prob)
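
    # Worked example (assumed toy counts, not from the gist itself): with
    # bigrams['eat'] == {'eggs': 2, 'spam': 1}, total is 3 and the loop
    # builds chain['eat'] == [(0.667, 'eggs', 0.667), (1.0, 'spam', 0.333)]
    # (the order of edges follows the inner dict's iteration order).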

    def get_next_words(self, cur_word):
        """
        Step along the Markov chain to produce a new word

        attributes:
            cur_word (str): word-node from which the step is taken
        """
        if cur_word in self.chain:
            options = self.chain[cur_word]
            choice = random.random()
            for cumprob, adj_word, _ in options:
                if choice < cumprob:
                    return adj_word
            # Guard against floating-point rounding leaving the final
            # cumulative probability just under 1.0.
            return options[-1][1]
        else:
            return None
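
    # Sampling sketch: random.random() draws u uniformly from [0, 1) and the
    # first edge whose cumulative probability exceeds u is chosen, i.e.
    # inverse-CDF sampling over the transition distribution. With chain['eat']
    # as in the example above, u = 0.8 falls past 0.667, so 'spam' is returned.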

    def get_text(self, minl=10, seed=None):
        """
        Random walk the Markov chain to produce pseudo-random text

        attributes:
            minl (int): minimum length of a block of text
            seed (Optional[str]): starting word of the text block,
                chosen at random otherwise
        """
        init = seed or random.choice(list(self.chain))
        text = [clean(init, first=True)]
        while True:
            new_word = self.get_next_words(text[-1])
            if new_word:
                text.append(new_word)
            elif len(text) < minl:
                # Dead end before reaching the minimum length: restart.
                init = seed or random.choice(list(self.chain))
                text = [clean(init, first=True)]
            else:
                # Long enough: punctuate the final word and return.
                text[-1] = clean(text[-1], last=True)
                return ' '.join(text)


def clean(string, first=None, last=None):
    """
    Clean up a word-node: capitalize the first word of a block, turn any
    comma on the last word into a period, and strip brackets and
    parentheses otherwise.
    """
    if first:
        string = string.title()
    elif last:
        string = re.sub(r',', '.', string)
    else:
        string = re.sub(r'[\[\]\(\)]', '', string)
    return string
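

# Example usage (a minimal sketch; 'corpus.txt' is a hypothetical file name,
# not part of the original gist):
if __name__ == '__main__':
    markov = Markov()
    markov.collect_bigrams_from_file('corpus.txt')
    markov.create_chain()
    print(markov.get_text(minl=10))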
do not use this code