# modified from: https://gist.github.com/dellis23/6174914/
# - Added NLTK, which simplifies the chain and ngram logic.
#   To use this script, you need to have downloaded the punkt
#   data like this:
#
#       import nltk
#       nltk.download('punkt')
#
# - No more occasional KeyErrors.
# - Produces sentences rather than a string of N words.
# - Sentences now always start with a capital letter.
# - Changed I/O:
#   The model now takes raw text as its input.
#   Input text is separated from model initialization.
#   As a consequence, training is also separated.
#
# Example (run this in a different file, or in the Python interpreter,
# in the same directory as markov.py):
#
#     from markov import Markov
#     m = Markov()
#     text = 'This is a text. It is just a string.\n It can be as long as you want.'
#     # if you have a text file in the same directory, you could also do this:
#     # with open('some_file.txt', encoding='utf-8') as f:
#     #     text = f.read()
#     m.add_text(text)
#     m.train()
#     generated_sentence = m.generate_markov_sentence()
import random

import nltk

class Markov(object):
    def __init__(self, chain_size=3):
        # Increase the chain size if you want more "typical" sentences that
        # stay closer to the original source material. This does require a
        # larger amount of text.
        self.chain_size = chain_size
        self.cache = {}
        self.trained = False
        self.contains_text = False
        self.words = []
        self.word_size = 0
        print('Module loaded. Please add text using model.add_text(), '
              'and then train the model using model.train().')
    def tokenize_text(self, text):
        # Split the text into sentences first, then word-tokenize each
        # sentence, and flatten the result into a single list of tokens.
        sentences = nltk.sent_tokenize(text)
        return [token for sentence in sentences
                for token in nltk.word_tokenize(sentence)]
    def add_text(self, text):
        self.words += self.tokenize_text(text)
        self.word_size = len(self.words)
        self.contains_text = True
    def remove_text(self):
        self.words = []
        self.word_size = 0
        self.contains_text = False
        self.trained = False
        print("The word list is empty, and the model is again untrained.")
    def train(self):
        self.cache = {}
        if not self.contains_text:
            return "Please add text using model.add_text()"
        # Build the chain: each (chain_size - 1)-gram maps to the list of
        # words that follow it in the corpus. Duplicates are kept on purpose,
        # so that random.choice reflects the observed frequencies.
        for chain_set in nltk.ngrams(self.words, self.chain_size):
            key = chain_set[:-1]
            next_word = chain_set[-1]
            if key in self.cache:
                self.cache[key].append(next_word)
            else:
                self.cache[key] = [next_word]
        self.trained = True
    def initial_candidates(self):
        # Seed candidates are the (chain_size - 1)-grams whose first word
        # starts with a capital letter, so generated sentences do too.
        return [gram[:self.chain_size - 1]
                for gram in nltk.ngrams(self.words, self.chain_size)
                if gram[0][0].isupper()]
    def generate_markov_sentence(self, limit=50):
        if not self.contains_text:
            return 'Please add text, and then train the model.'
        if not self.trained:
            return 'Please train the model first.'
        gen_words = []
        seed_words = random.choice(self.initial_candidates())
        gen_words.extend(seed_words)
        while True:
            key = tuple(gen_words[-(self.chain_size - 1):])
            # The very last (chain_size - 1)-gram of the corpus may never
            # occur as a key, so stop early if we happen to reach it.
            if key not in self.cache:
                return ' '.join(gen_words)
            next_word = random.choice(self.cache[key])
            gen_words.append(next_word)
            # Stop at the end of a sentence, or when the word limit is hit.
            if next_word == '.' or len(gen_words) > limit:
                return ' '.join(gen_words)
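

# Minimal usage sketch: the sample text below is a made-up placeholder
# corpus, just enough for the chain to produce output. In practice you
# would load a real text, e.g. from a file as shown in the header comment.
if __name__ == '__main__':
    sample_text = ('The quick brown fox jumps over the lazy dog. '
                   'The lazy dog sleeps all day. '
                   'The quick brown fox runs through the forest.')
    m = Markov(chain_size=3)
    m.add_text(sample_text)
    m.train()
    print(m.generate_markov_sentence())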