Last active
July 17, 2019 07:22
-
-
Save airspeedswift/6d8c9d95f0d812f58be3 to your computer and use it in GitHub Desktop.
Norvig Spellchecker in Swift
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Translation of [Peter Norvig's spell checker](http://norvig.com/spell-correct.html) into Swift. | |
/// Sample input corpus [here](http://norvig.com/big.txt) | |
import Foundation.NSString // purely for IO, most things done with Swift.String | |
// pythony slicing | |
postfix operator ..< { } | |
prefix operator ..< { } | |
postfix func ..<<I: ForwardIndexType>(lhs: I) -> RangeStart<I> { return RangeStart(start: lhs) } | |
prefix func ..<<I: ForwardIndexType>(rhs: I) -> RangeEnd<I> { return RangeEnd(end: rhs) } | |
struct RangeStart<I: ForwardIndexType> { let start: I } | |
struct RangeEnd<I: ForwardIndexType> { let end: I } | |
extension Array { | |
subscript(r: RangeStart<Int>) -> SubSlice { return self[r.start..<self.endIndex] } | |
subscript(r: RangeEnd<Int>) -> SubSlice { return self[self.startIndex..<r.end] } | |
} | |
extension String { | |
subscript(r: RangeStart<String.Index>) -> String { return self[r.start..<self.endIndex] } | |
subscript(r: RangeEnd<String.Index>) -> String { return self[self.startIndex..<r.end] } | |
} | |
/// Return an `Array` containing the results of mapping `transform` | |
/// over `source`, filtering out results where `transform` returns `nil`. | |
func mapSome<S: SequenceType, U>(source: S, transform: S.Generator.Element -> U?) -> [U] { | |
return lazy(source).map(transform).filter { $0 != nil }.map { $0! }.array | |
} | |
/// Given a word, produce a set of possible alternatives with | |
/// letters transposed, deleted, replaced or rogue characters inserted | |
func edits(word: String) -> Set<String> { | |
if word.isEmpty { return [] } | |
let splits: [(String,String)] = indices(word).map { | |
return (word[..<$0],word[$0..<]) | |
} | |
let deletes = splits.map { $0.0 + dropFirst($0.1) } | |
let transposes: [String] = mapSome(splits) { left, right in | |
if let fst = first(right) { | |
let drop1 = dropFirst(right) | |
if let snd = first(drop1) { | |
let drop2 = dropFirst(drop1) | |
return "\(left)\(snd)\(fst)\(drop2)" | |
} | |
} | |
return nil | |
} | |
let alphabet = "abcdefghijklmnopqrstuvwxyz" | |
let replaces = splits.flatMap { left, right in | |
map(alphabet) { letter in | |
"\(left)\(letter)\(dropFirst(right))" | |
} | |
} | |
let inserts = splits.flatMap { left, right in | |
map(alphabet) { letter in | |
"\(left)\(letter)\(right)" | |
} | |
} | |
return Set(deletes + transposes + replaces + inserts) | |
} | |
struct SpellChecker { | |
var knownWords: [String:Int] = [:] | |
init?(contentsOfFile file: String) { | |
if let text = NSString(contentsOfFile: file, encoding: NSUTF8StringEncoding, error: nil)?.lowercaseString { | |
println("Loaded file \(file)") | |
// not really unicode-friendly, but splitting String directly is sloooow... | |
let words = split(text.unicodeScalars) { !("a"..."z").contains($0) }.map { String($0) } | |
println("Split string into \(words.count) words") | |
for word in words { self.train(word) } | |
println("Trained \(knownWords.count) unique words") | |
} | |
else { | |
return nil | |
} | |
} | |
mutating func train(word: String) { | |
knownWords[word] = knownWords[word]?.successor() ?? 1 | |
} | |
func knownEdits2(word: String) -> Set<String>? { | |
var known_edits: Set<String> = [] | |
for edit in edits(word) { | |
if let k = known(edits(edit)) { | |
known_edits.unionInPlace(k) | |
} | |
} | |
return known_edits.isEmpty ? nil : known_edits | |
} | |
func known<S: SequenceType where S.Generator.Element == String>(words: S) -> Set<String>? { | |
let s = Set(filter(words) { self.knownWords.indexForKey($0) != nil }) | |
return s.isEmpty ? nil : s | |
} | |
func correct(word: String) -> String { | |
let candidates = known([word]) ?? known(edits(word)) ?? knownEdits2(word) | |
return reduce(candidates ?? [], word) { | |
(knownWords[$0] ?? 1) < (knownWords[$1] ?? 1) ? $1 : $0 | |
} | |
} | |
} | |
// main() | |
if Process.argc == 2, let checker = SpellChecker(contentsOfFile: Process.arguments[1]) { | |
println("Type word to check and hit enter") | |
while let word = NSString(data: NSFileHandle.fileHandleWithStandardInput().availableData, | |
encoding: NSUTF8StringEncoding) as? String | |
where last(word) == "\n" | |
{ | |
let word = dropLast(word) | |
let checked = checker.correct(word) | |
if word == checked { | |
println("\(word) unchanged") | |
} | |
else { | |
println("\(word) -> \(checked)") | |
} | |
} | |
} | |
else { | |
println("Usage: \(Process.arguments[0]) <corpus_filename>") | |
} |
+1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Any way you could update this for swift 3? I can't quite seem to figure out the errors