Last active
August 29, 2015 14:04
-
-
Save JnBrymn/76100614a6500e193cff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* MarkovModel creates a Markov model of text (or tokens) and allows you to generate new
 * text from the model. It takes three optional arguments:
 *
 * tokenizer - a function that takes a string and returns an array of tokens
 *     defaults to a tokenizer that breaks on whitespace and lowercases everything
 * shingle_n - the number of tokens that make up a state in the markov model
 *     the higher the number the more realistic the generated data, but the more
 *     training data required
 *     defaults to 1
 * join_str - string used to join generated tokens back into text
 *     defaults to a single space
 */
function MarkovModel(tokenizer, shingle_n, join_str) {
    this.shingle_n = shingle_n || 1;
    // Default tokenizer: lowercase, then split on RUNS of whitespace (/\s+/).
    // A bare /\s/ split would emit empty tokens for consecutive spaces and
    // pollute the model; the filter also drops leading/trailing empties.
    this.tokenizer = tokenizer || function(text) {
        return text.toLowerCase().split(/\s+/).filter(function(t) {
            return t.length > 0;
        });
    };
    this.join_str = join_str || " ";
    //Consider trying this too
    // this.shingle_n = shingle_n || 2;
    // this.tokenizer = tokenizer || function(text) {
    //     return text.toLowerCase().split(/\b/);
    // };
    // Non-printing control characters so they can never collide with real text.
    // BUG FIX: the original delimiter was "\u0037", which is the literal digit
    // "7" -- any token containing a 7 would be silently split by _shift_key.
    this.delimiter = "\u0001";
    this.start_token = "\u0002";
    this.end_token = "\u0003";
    // Slide the n-gram state window: append next_token, then drop tokens from
    // the front until at most shingle_n remain.
    this._shift_key = function(current_key, next_token) {
        var tokens = current_key.split(this.delimiter);
        tokens.push(next_token);
        while (tokens.length > this.shingle_n) {
            tokens.shift();
        }
        return tokens.join(this.delimiter);
    };
    // model[state] = {count: total transitions seen, tokens_and_counts: {token: count}}
    this.model = {};
}
/**
 * Fold one training sample into the model.
 * @param {string|string[]} text - sample text (tokenized with this.tokenizer)
 *     or an already-tokenized array of tokens. A caller-supplied array is
 *     copied, not mutated.
 */
MarkovModel.prototype.addSample = function(text) {
    if (typeof text == "string") {
        text = this.tokenizer(text);
    } else {
        // BUG FIX: the original pushed end_token onto the caller's own array.
        text = text.slice();
    }
    text.push(this.end_token);
    var new_key_struct = function() {
        return {count:0, tokens_and_counts:{}};
    };
    // BUG FIX: `key` was an implicit global, so interleaved addSample calls on
    // two models (or re-entrant use) would corrupt each other's state.
    var key = this.start_token;
    for (var i = 0; i < text.length; i++) {
        this.model[key] = this.model[key] || new_key_struct();//make sure it's been initialized
        this.model[key].count ++;
        var token = text[i];
        this.model[key].tokens_and_counts[token] = this.model[key].tokens_and_counts[token] || 0;//make sure it's been initialized
        this.model[key].tokens_and_counts[token]++;
        key = this._shift_key(key,token);
    }
};
/**
 * Generate text by a random walk over the model from the start state.
 * @param {number} [max_len=100] - maximum number of tokens to emit
 * @returns {string} generated tokens joined with this.join_str; the empty
 *     string if the model has not been trained.
 */
MarkovModel.prototype.generateText = function(max_len) {
    max_len = max_len || 100;
    var tokens = [];
    // BUG FIX: `key` and `until` below were implicit globals (missing var).
    var key = this.start_token;
    for (var i = 0; i < max_len; i++) {
        var sub_model = this.model[key];
        if (!sub_model) {
            // Untrained model (or unreachable state): nothing to sample from.
            // The original threw a TypeError here on an empty model.
            break;
        }
        //here I pick a token with probability based upon how common the token was for the given key
        var until = Math.floor(Math.random()*sub_model.count);
        for (var token in sub_model.tokens_and_counts) {
            until -= sub_model.tokens_and_counts[token]; //subtract the count for this token
            // BUG FIX: the original tested `until <= 0`, giving the first token
            // one extra chance out of `count`. With until in [0, count-1] and
            // the counts summing to `count`, `until < 0` is the unbiased test
            // and is still guaranteed to fire (final residue is <= -1).
            if (until < 0) {
                if (token == this.end_token) {
                    //then we've reached the end of a sentence
                    return tokens.join(this.join_str);
                }
                tokens.push(token);
                key = this._shift_key(key,token);
                break;
            }
        }
    }
    return tokens.join(this.join_str);
};
// Example usage: train on one sentence per entry in `hits`, then emit a
// generated sentence.
// NOTE(review): `hits` and getSentenceFromSomewhere() are assumed to exist in
// the surrounding environment -- they are not defined in this file. As written
// the loop ignores the key and just calls the getter once per entry.
var mm = new MarkovModel();
for (var hit in hits) {
    mm.addSample(getSentenceFromSomewhere());
}
console.log(mm.generateText());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.