# modified from: https://gist.github.com/dellis23/6174914/
# - Added NLTK, which simplifies the chain and ngram logic.
#   To use this script, you need to have downloaded the punkt
#   data like this:
#
#       import nltk
#       nltk.download('punkt')
#
# - No more occasional KeyErrors.
# - Produces sentences rather than a string of N words.
# - Sentences now always start with a capital letter.
# - Changed I/O:
#   The model now takes raw text as its input.
#   Input text is separated from model initialization.
#   As a consequence, training is also separated.
#
# Example (run this in a different file, or in the Python interpreter,
# in the same directory as markov.py):
#
#     from markov import Markov
#     m = Markov()
#     text = 'This is a text. It is just a string.\n It can be as long as you want.'
#     # if you have a text file in the same directory, you could also do this:
#     # with open('some_file.txt', encoding='utf-8') as f:
#     #     text = f.read()
#     m.add_text(text)
#     m.train()
#     generated_sentence = m.generate_markov_sentence()
import random

import nltk

class Markov(object):
    def __init__(self, chain_size=3):
        # Increase the chain size if you want more "typical" sentences that
        # stay closer to the original source material. This does require a
        # larger amount of text.
        self.chain_size = chain_size
        self.cache = {}
        self.trained = False
        self.contains_text = False
        self.words = []
        self.word_size = 0
        print('Module loaded. Please add text using model.add_text(), '
              'and then train the model using model.train().')
    def tokenize_text(self, text):
        # Split the text into sentences first, then word-tokenize each
        # sentence, and flatten the result into a single list of tokens.
        sentences = nltk.sent_tokenize(text)
        return [token for sentence in sentences
                for token in nltk.word_tokenize(sentence)]
    def add_text(self, text):
        self.words += self.tokenize_text(text)
        self.word_size = len(self.words)
        self.contains_text = True
    def remove_text(self):
        self.words = []
        self.word_size = 0
        self.contains_text = False
        self.trained = False
        print("The word list is empty, and the model is again untrained.")
    def train(self):
        self.cache = {}
        if not self.contains_text:
            return "Please add text using model.add_text()"
        # Build the chain: each (chain_size - 1)-gram maps to the list of
        # words that follow it in the corpus. Duplicates are kept on purpose,
        # so that random.choice reflects the observed frequencies.
        for chain_set in nltk.ngrams(self.words, self.chain_size):
            key = chain_set[:-1]
            next_word = chain_set[-1]
            if key in self.cache:
                self.cache[key].append(next_word)
            else:
                self.cache[key] = [next_word]
        self.trained = True
    def initial_candidates(self):
        # Seed candidates are the (chain_size - 1)-grams whose first word
        # starts with a capital letter, so generated sentences do too.
        return [gram[:self.chain_size - 1]
                for gram in nltk.ngrams(self.words, self.chain_size)
                if gram[0][0].isupper()]
    def generate_markov_sentence(self, limit=50):
        if not self.contains_text:
            return 'Please add text, and then train the model.'
        if not self.trained:
            return 'Please train the model first.'
        gen_words = []
        seed_words = random.choice(self.initial_candidates())
        gen_words.extend(seed_words)
        while True:
            key = tuple(gen_words[-(self.chain_size - 1):])
            # The very last (chain_size - 1)-gram of the corpus may never
            # occur as a key, so stop early if we happen to reach it.
            if key not in self.cache:
                return ' '.join(gen_words)
            next_word = random.choice(self.cache[key])
            gen_words.append(next_word)
            # Stop at the end of a sentence, or when the word limit is hit.
            if next_word == '.' or len(gen_words) > limit:
                return ' '.join(gen_words)
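

# Minimal usage sketch: the sample text below is a made-up placeholder
# corpus, just enough for the chain to produce output. In practice you
# would load a real text, e.g. from a file as shown in the header comment.
if __name__ == '__main__':
    sample_text = ('The quick brown fox jumps over the lazy dog. '
                   'The lazy dog sleeps all day. '
                   'The quick brown fox runs through the forest.')
    m = Markov(chain_size=3)
    m.add_text(sample_text)
    m.train()
    print(m.generate_markov_sentence())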