Created September 23, 2014 00:05
Generates Parody Text.
| """ | |
| This module contains a markov chain class that can be used to generate | |
| a psuedo random list of elements from a selected corpus. It does so | |
| by collecting all existing bigrams e.i. (element_i, element_i+1) of the | |
| selected corpus and defining a markov chain with transition probabilities | |
| proportional to the relative frequency of the bigrams. | |
| Author: Jason Liu | |
| Date: May 24th, 2014 | |
| Language | |
| -------- | |
| While a markov chain can generate elements of any kind, weather, music, etc. | |
| The docstrings will pertain to generating text from a corpus. We will use | |
| 'word-node' or 'primary-word' to discuss a state, and 'adjacent-node' or | |
| 'adjacent-word' to discuss the edges for a certain 'node'. | |
| The MIT License (MIT) | |
| Copyright (c) 2014 | |
| """ | |
import random
from collections import defaultdict
import re


class WordNode(object):
    # Placeholder class; not used by Markov below.
    pass


class Markov(object):
    """
    Markov Chain Pseudo Text Generator
    ----------------------------------
    This class is a text generator that follows the Markov property.
    It collects bigrams from text and builds a bag of adjacent words
    for each word in the text, e.g. {'eat': ['eggs', 'eggs', 'spam'], ...}

    By calling the create_chain() method, it collects the bag and produces
    the Markov chain. The chain is a set of possible destinations for
    each word. Each destination contains a cumulative probability, the word,
    and a relative probability, e.g. {'eat': [(0.666, 'eggs', 0.666),
                                              (1.0, 'spam', 0.333)]}

    Attributes
    ----------
    Markov.bigrams:
        - A bag of adjacent words (with counts) for each word-node.
    Markov.chain:
        - A list of (cumulative probability, word, probability) edges
          for each word-node.
    """

    def __init__(self):
        self.chain = {}
        # defaultdict needs a callable factory, hence the lambda.
        self.bigrams = defaultdict(lambda: defaultdict(int))

    def __str__(self):
        return self.get_text()

    def collect_bigrams_from_file(self, filename):
        """Collect all bigrams into Markov.bigrams from a text file

        attributes:
            filename (str): name of target file
        """
        with open(filename) as infile:
            for line in infile:
                self.collect_bigrams_from_string(line)

    def collect_bigrams_from_string(self, string):
        """Collect all bigrams into Markov.bigrams from a string

        attributes:
            string (str): target string
        """
        text = [clean(word) for word in string.split()]
        for word, adj_word in zip(text, text[1:]):
            self.update(word, adj_word)

    def update(self, word, adj_word):
        """Update the collection of bigrams with a single bigram

        attributes:
            word (str): Primary word (word-node)
            adj_word (str): Adjacent word
        """
        self.bigrams[word][adj_word] += 1

    def count_words(self, word):
        """Count the elements affiliated with a primary word

        attributes:
            word (str): Primary word
        """
        return sum(self.bigrams[word].values())

    def create_chain(self):
        """
        Create a complete Markov.chain from Markov.bigrams
        """
        for word in self.bigrams:
            self.update_chain(word)

    def update_chain(self, word):
        """Update a single word-node on the Markov chain and rebalance
        probabilities for each edge.

        attributes:
            word (str): word-node to update
        """
        if word in self.bigrams:
            total = self.count_words(word)
            cumprob = 0.0
            self.chain[word] = [()] * len(self.bigrams[word])
            for i, edge in enumerate(self.bigrams[word]):
                prob = self.bigrams[word][edge] / float(total)
                cumprob += prob
                self.chain[word][i] = (cumprob, edge, prob)
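
    # Worked example (assumed toy counts, not from the gist itself): with
    # bigrams['eat'] == {'eggs': 2, 'spam': 1}, total is 3 and the loop
    # builds chain['eat'] == [(0.667, 'eggs', 0.667), (1.0, 'spam', 0.333)]
    # (the order of edges follows the inner dict's iteration order).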

    def get_next_words(self, cur_word):
        """
        Step along the Markov chain to produce a new word

        attributes:
            cur_word (str): word-node from which the step is taken
        """
        if cur_word in self.chain:
            options = self.chain[cur_word]
            choice = random.random()
            for cumprob, adj_word, _ in options:
                if choice < cumprob:
                    return adj_word
            # Guard against floating-point rounding leaving the final
            # cumulative probability just under 1.0.
            return options[-1][1]
        else:
            return None
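
    # Sampling sketch: random.random() draws u uniformly from [0, 1) and the
    # first edge whose cumulative probability exceeds u is chosen, i.e.
    # inverse-CDF sampling over the transition distribution. With chain['eat']
    # as in the example above, u = 0.8 falls past 0.667, so 'spam' is returned.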

    def get_text(self, minl=10, seed=None):
        """
        Random walk the Markov chain to produce pseudo-random text

        attributes:
            minl (int): minimum length of a block of text
            seed (Optional[str]): starting word of the text block,
                chosen at random otherwise
        """
        init = seed or random.choice(list(self.chain))
        text = [clean(init, first=True)]
        while True:
            new_word = self.get_next_words(text[-1])
            if new_word:
                text.append(new_word)
            elif len(text) < minl:
                # Dead end before reaching the minimum length: restart.
                init = seed or random.choice(list(self.chain))
                text = [clean(init, first=True)]
            else:
                # Long enough: punctuate the final word and return.
                text[-1] = clean(text[-1], last=True)
                return ' '.join(text)


def clean(string, first=None, last=None):
    """
    Clean up a word-node: capitalize the first word of a block, turn any
    comma on the last word into a period, and strip brackets and
    parentheses otherwise.
    """
    if first:
        string = string.title()
    elif last:
        string = re.sub(r',', '.', string)
    else:
        string = re.sub(r'[\[\]\(\)]', '', string)
    return string
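

# Example usage (a minimal sketch; 'corpus.txt' is a hypothetical file name,
# not part of the original gist):
if __name__ == '__main__':
    markov = Markov()
    markov.collect_bigrams_from_file('corpus.txt')
    markov.create_chain()
    print(markov.get_text(minl=10))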
do not use this code