Last active
August 29, 2015 14:04
-
-
Save JnBrymn/76100614a6500e193cff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* MarkovModel creates a Markov model of text (or tokens) and allows you to generate new
 * text from the model. It takes three optional arguments:
 *
 * tokenizer - a function that takes a string and returns an array of tokens
 *     defaults to a tokenizer that breaks on whitespace and lowercases everything
 * shingle_n - the number of tokens that make up a state in the markov model
 *     the higher the number the more realistic the generated data, but the more
 *     training data required
 *     defaults to 1
 * join_str - string used to join generated tokens back into text
 *     defaults to a single space
 */
function MarkovModel(tokenizer, shingle_n, join_str) {
    this.shingle_n = shingle_n || 1;
    // Default tokenizer: lowercase, then split on RUNS of whitespace (/\s+/).
    // A bare /\s/ split would emit empty tokens for consecutive spaces and
    // pollute the model; the filter also drops leading/trailing empties.
    this.tokenizer = tokenizer || function(text) {
        return text.toLowerCase().split(/\s+/).filter(function(t) {
            return t.length > 0;
        });
    };
    this.join_str = join_str || " ";
    //Consider trying this too
    // this.shingle_n = shingle_n || 2;
    // this.tokenizer = tokenizer || function(text) {
    //     return text.toLowerCase().split(/\b/);
    // };
    // Non-printing control characters so they can never collide with real text.
    // BUG FIX: the original delimiter was "\u0037", which is the literal digit
    // "7" -- any token containing a 7 would be silently split by _shift_key.
    this.delimiter = "\u0001";
    this.start_token = "\u0002";
    this.end_token = "\u0003";
    // Slide the n-gram state window: append next_token, then drop tokens from
    // the front until at most shingle_n remain.
    this._shift_key = function(current_key, next_token) {
        var tokens = current_key.split(this.delimiter);
        tokens.push(next_token);
        while (tokens.length > this.shingle_n) {
            tokens.shift();
        }
        return tokens.join(this.delimiter);
    };
    // model[state] = {count: total transitions seen, tokens_and_counts: {token: count}}
    this.model = {};
}
/**
 * Fold one training sample into the model.
 * @param {string|string[]} text - sample text (tokenized with this.tokenizer)
 *     or an already-tokenized array of tokens. A caller-supplied array is
 *     copied, not mutated.
 */
MarkovModel.prototype.addSample = function(text) {
    if (typeof text == "string") {
        text = this.tokenizer(text);
    } else {
        // BUG FIX: the original pushed end_token onto the caller's own array.
        text = text.slice();
    }
    text.push(this.end_token);
    var new_key_struct = function() {
        return {count:0, tokens_and_counts:{}};
    };
    // BUG FIX: `key` was an implicit global, so interleaved addSample calls on
    // two models (or re-entrant use) would corrupt each other's state.
    var key = this.start_token;
    for (var i = 0; i < text.length; i++) {
        this.model[key] = this.model[key] || new_key_struct();//make sure it's been initialized
        this.model[key].count ++;
        var token = text[i];
        this.model[key].tokens_and_counts[token] = this.model[key].tokens_and_counts[token] || 0;//make sure it's been initialized
        this.model[key].tokens_and_counts[token]++;
        key = this._shift_key(key,token);
    }
};
/**
 * Generate text by a random walk over the model from the start state.
 * @param {number} [max_len=100] - maximum number of tokens to emit
 * @returns {string} generated tokens joined with this.join_str; the empty
 *     string if the model has not been trained.
 */
MarkovModel.prototype.generateText = function(max_len) {
    max_len = max_len || 100;
    var tokens = [];
    // BUG FIX: `key` and `until` below were implicit globals (missing var).
    var key = this.start_token;
    for (var i = 0; i < max_len; i++) {
        var sub_model = this.model[key];
        if (!sub_model) {
            // Untrained model (or unreachable state): nothing to sample from.
            // The original threw a TypeError here on an empty model.
            break;
        }
        //here I pick a token with probability based upon how common the token was for the given key
        var until = Math.floor(Math.random()*sub_model.count);
        for (var token in sub_model.tokens_and_counts) {
            until -= sub_model.tokens_and_counts[token]; //subtract the count for this token
            // BUG FIX: the original tested `until <= 0`, giving the first token
            // one extra chance out of `count`. With until in [0, count-1] and
            // the counts summing to `count`, `until < 0` is the unbiased test
            // and is still guaranteed to fire (final residue is <= -1).
            if (until < 0) {
                if (token == this.end_token) {
                    //then we've reached the end of a sentence
                    return tokens.join(this.join_str);
                }
                tokens.push(token);
                key = this._shift_key(key,token);
                break;
            }
        }
    }
    return tokens.join(this.join_str);
};
// Example usage: train on one sentence per entry in `hits`, then emit a
// generated sentence.
// NOTE(review): `hits` and getSentenceFromSomewhere() are assumed to exist in
// the surrounding environment -- they are not defined in this file. As written
// the loop ignores the key and just calls the getter once per entry.
var mm = new MarkovModel();
for (var hit in hits) {
    mm.addSample(getSentenceFromSomewhere());
}
console.log(mm.generateText());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.