Created
July 15, 2015 11:13
-
-
Save thedmitriyk/a70519d43c12108ad0d5 to your computer and use it in GitHub Desktop.
A naïve Scala implementation of Peter Norvig's Spelling Corrector from http://norvig.com/spell-correct.html Optimized for neither readability nor brevity nor performance.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
object SpellCheck { | |
val alpha = "abcdefghijklmnopqrstuvwxyz" | |
val model = scala.io.Source.fromFile("big.txt") | |
.getLines() | |
.flatMap(_.toLowerCase.split("\\W+")) | |
.foldLeft(Map.empty[String, Int].withDefaultValue(1)) { (acc, word) => | |
acc + (word -> (acc.getOrElse(word, 0) + 1)) | |
} | |
def known(words: Set[String]) = words.intersect(model.keySet) | |
def edits1(word: String) = { | |
val lcWord = word.toLowerCase | |
val splits = lcWord.inits.map(i => (i, lcWord.drop(i.length))).toSet | |
splits.collect { // deletes | |
case (a, b) if b.length > 0 => a + b.drop(1) | |
} ++ splits.collect { // transposes | |
case (a, b) if b.length > 1 => a + b.take(2).reverse + b.drop(2) | |
} ++ splits.flatMap { // inserts | |
case (a, b) => alpha.map(a + _ + b) | |
} ++ splits.collect { // replaces | |
case (a, b) if b.length > 0 => alpha.map(a + _ + b.drop(1)) | |
}.flatten | |
} | |
def known_edits2(word: String) = known(edits1(word).flatMap(edits1)) | |
def apply(word: String) = { | |
known(Set(word)).headOption | |
.orElse(known(edits1(word)).toSeq.sortBy(model).lastOption) | |
.orElse(known_edits2(word).toSeq.sortBy(model).lastOption) | |
.getOrElse(word) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment