Skip to content

Instantly share code, notes, and snippets.

@adrianseeley
Created January 31, 2013 14:48
Show Gist options
  • Save adrianseeley/4683335 to your computer and use it in GitHub Desktop.
Save adrianseeley/4683335 to your computer and use it in GitHub Desktop.
JS ngram + ngram mash
/**
 * Builds the list of unique word n-grams of a piece of text, including the
 * n-grams of every "mashed" variant (adjacent words glued together) that
 * ngram_mash produces for it.
 *
 * The text is lower-cased and every character other than a-z and the space is
 * dropped before it is split into words on spaces. Empty words (caused by
 * leading, trailing or consecutive spaces, e.g. after punctuation has been
 * stripped from "a - b") are discarded so they cannot yield empty or
 * space-padded n-grams.
 *
 * The result is written to the console and also returned.
 *
 * @param {*} string - Text to analyse; converted with its own toString(), so
 *     null/undefined raise a TypeError.
 * @returns {string[]} Unique n-grams in first-seen order, words separated by
 *     a single space. Empty when the text contains no words.
 */
function ngram(string)
{
    var valid = 'abcdefghijklmnopqrstuvwxyz ';
    var lowered = string.toString().toLowerCase();

    // Keep only the whitelisted characters; everything else is removed
    // outright (not replaced by a space).
    var cleaned = '';
    for(var s = 0; s < lowered.length; s++)
    {
        var ch = lowered.charAt(s);
        if(valid.indexOf(ch) !== -1) cleaned += ch;
    }

    var words = cleaned.split(' ').filter(function(word) { return word.length > 0; });

    // ngram_mash returns the unmerged word list as its first entry, so the
    // plain n-grams are covered without adding the list a second time.
    var to_gram = ngram_mash(words);

    var ngrams = [];
    var seen = Object.create(null); // prototype-free lookup of emitted n-grams
    for(var g = 0; g < to_gram.length; g++)
    {
        var variant = to_gram[g];
        for(var l = 0; l < variant.length; l++)
        {
            // Grow the phrase one word at a time: words l..r for each r >= l.
            var phrase = variant[l];
            for(var r = l; r < variant.length; r++)
            {
                if(r > l) phrase += ' ' + variant[r];
                if(!seen[phrase])
                {
                    seen[phrase] = true;
                    ngrams.push(phrase);
                }
            }
        }
    }

    console.log(ngrams);
    return ngrams;
}
/**
 * Produces every distinct variant of a word list that can be obtained by
 * repeatedly concatenating two adjacent words into one.
 *
 * For ['a', 'b', 'c'] the result is
 * [['a', 'b', 'c'], ['ab', 'c'], ['a', 'bc'], ['abc']].
 *
 * Variants are compared by content, so a variant reachable through several
 * merge orders is expanded and returned only once. A list of n words yields
 * 2^(n-1) variants.
 *
 * @param {string[]} split_string - Words to merge; not modified.
 * @returns {string[][]} All variants in breadth-first order. The first entry
 *     is split_string itself (same array instance), the rest are new arrays.
 */
function ngram_mash(split_string)
{
    // Serves as both the work queue and the result: entries appended while
    // iterating are visited by the same loop.
    var variants = [split_string];

    // Content keys of every variant already queued. Array identity cannot be
    // used here because each merge builds a brand-new array.
    var queued = Object.create(null);
    queued[JSON.stringify(split_string)] = true;

    for(var v = 0; v < variants.length; v++)
    {
        var current = variants[v];
        for(var m = 0; m < current.length - 1; m++)
        {
            // Copy of current with words m and m + 1 glued together.
            var building = current.slice(0, m);
            building.push(current[m] + current[m + 1]);
            building = building.concat(current.slice(m + 2));

            var key = JSON.stringify(building);
            if(!queued[key])
            {
                queued[key] = true;
                variants.push(building);
            }
        }
    }
    return variants;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment