Last active
August 29, 2015 14:11
-
-
Save zzmp/4bf43e7ea507045e5eab to your computer and use it in GitHub Desktop.
Spellchecker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This code is meant to accompany this [blog entry](http://www.garabagne.io/2014/12/15/spellchecker/). | |
// It was inspired primarily by [this article](http://www.norvig.com/spell-correct.html). | |
Spellchecker = function() { | |
/** A spellchecker. | |
* _Will suggest more common spellings of questionable words._ | |
* @class Spellchecker | |
* @param {string} [text] - space-separated text, as from a book, to use for initial training | |
* @example | |
* // someText = 'hello world' | |
* // moreText = 'goodbye world' | |
* var spellchecker = new Spellchecker(someText) | |
* spellchecker.train(moreText) | |
* var sentence = 'hillo sweet wrld'.split(' ') | |
* .map(function(word) { | |
* var candidates = spellchecker.correct(word) | |
* if (candidates.length) return candidates[0] | |
* else return word | |
* }) | |
* .join(' ') | |
* console.log(sentence) // 'hello sweet world' | |
*/ | |
function Spellchecker(text) { | |
this.dict = {} | |
if (text) this.train(text) | |
} | |
/** Train the instance with a block of space-separated text. | |
* _Training is meant to accomodate for word frequency, so a dictionary of words is not appropriate._ | |
* @param {string} dictionary - space-separated text, as from a book | |
*/ | |
Spellchecker.prototype.train = train | |
/** Return an array of possible corrections to a word. | |
* _If the word appears to be correct, an empty array will be returned._ | |
* @param {string} word - a word to correct | |
* @returns {array} candidates - the most likely corrections, lower-cased (will not include `word`) | |
*/ | |
Spellchecker.prototype.correct = correct | |
var letters = 'abcdefghijklmnopqrstuvwxyz'.split('') | |
function train(dictionary) { | |
var dict = dictionary.toLowerCase(), | |
regexp = /[a-z]+/ig, | |
words | |
while ((words = regexp.exec(dict))) | |
this.dict[words[0]] = this.dict.hasOwnProperty(words[0]) ? | |
this.dict[words[0]] + 1 : 1 | |
} | |
function correct(word) { | |
if (this.dict.hasOwnProperty(word)) return [] | |
var dict = this.dict, | |
score = 0 | |
return edits(word) | |
.filter(function(candidate) { | |
if (dict.hasOwnProperty(candidate)) { | |
score = Math.max(score, dict[candidate]) | |
return true | |
} | |
}) | |
.filter(function(candidate) { | |
return dict[candidate] === score | |
}) | |
} | |
/** Exploratory function to edit a word. | |
* _Edits can include:_ | |
* - _deletion_ | |
* - _transposition_ | |
* - _insertion_ | |
* - _alteration_ | |
* @param {string} word - the word to explore. | |
* @returns {array} words - a new problem-space of word edits. | |
*/ | |
function edits(word) { | |
var words = [] | |
// Assume UTF-8 (i.e. can be simply iterated) | |
for (var i = 0; i < word.length; i++) { | |
var pre = word.slice(0, i), | |
post = word.slice(i), | |
rest = word.slice(i + 1) | |
// deletion - remove the current letter | |
words.push(pre + rest) | |
// transposition - shift the current letter | |
words.push(pre, rest.slice(0, 1), word[i], rest.slice(1)) | |
letters.forEach(function(letter) { | |
// insertion - insert a 'missing' letter | |
words.push(pre + letter + post) | |
// alteration - fix a 'mistyped' letter | |
words.push(pre + letter + rest) | |
}) | |
} | |
return words | |
} | |
return Spellchecker | |
}() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment