Last active
June 21, 2024 07:57
-
-
Save airspeedswift/1cd9cd1607de104ecf7bbfb80865c8bd to your computer and use it in GitHub Desktop.
A simple spelling corrector in Swift
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Swift version of the spelling checker described in http://norvig.com/spell-correct.html | |
import Foundation | |
let alphabet = "abcdefghijklmnopqrstuvwxyz" | |
/// Given a word, produce a set of possible edits with one character | |
/// transposed, deleted, replaced or a rogue character inserted | |
func edits(_ word: String) -> Set<String> { | |
// create an array of the word split into pairs at each possible point | |
// including an empty string at the start | |
var splits = word.indices.map { (word[..<$0], word[$0...]) } | |
// and the empty string at the end | |
splits.append((word[...],"")) | |
// delete the first character of the right part | |
let deletes = splits.map { left, right in | |
"\(left)\(right.dropFirst())" | |
} | |
// transpose the first and second character of the right part | |
let transposed = splits.compactMap { left, right in | |
if let first = right.first, let second = right.dropFirst().first { | |
"\(left)\(second)\(first)\(right.dropFirst(2))" | |
} else { | |
nil | |
} | |
} | |
// replace the first character of the right with every letter | |
let replaced = splits.flatMap { left, right in | |
alphabet.map { "\(left)\($0)\(right.dropFirst())" } | |
} | |
// insert every letter between left and right | |
let inserted = splits.flatMap { left, right in | |
alphabet.map { "\(left)\($0)\(right)" } | |
} | |
return Set(deletes + transposed + replaced + inserted) | |
} | |
struct SpellChecker { | |
var knownWords: [String:Int] = [:] | |
init(from file: String) throws { | |
let contents = try String(contentsOfFile: file, encoding: .utf8) | |
for word in contents.split(whereSeparator: \.isWhitespace) { | |
// bump known words | |
knownWords[word.lowercased(), default: 0] += 1 | |
} | |
print("Trained \(knownWords.count) unique words") | |
} | |
func known(words: Set<String>) -> [String:Int] { | |
knownWords.filter { words.contains($0.key) } | |
} | |
func best(for words: Set<String>) -> String? { | |
known(words: words).max(by: { $0.value < $1.value })?.key | |
} | |
func correct(word: String) -> String? { | |
if knownWords.keys.contains(word) { return word } | |
let edits1 = edits(word) | |
if let best1 = best(for: edits1) { | |
return best1 | |
} | |
let edits2 = Set(edits1.flatMap(edits)) | |
if let best2 = best(for: edits2) { | |
return best2 | |
} | |
return nil | |
} | |
} | |
// the one argument to this is a big corpus of text to train the checker | |
// such as http://norvig.com/big.txt | |
let checker = try! SpellChecker(from: CommandLine.arguments[1]) | |
while let word = readLine() { | |
let response = | |
switch checker.correct(word: word) { | |
case word?: "Correct!" | |
case let corrected?: "Corrected: \(corrected)" | |
case nil: "Not found." | |
} | |
print(response) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment