Last active
May 30, 2019 20:51
-
-
Save shaulhameed/568d2ecdf3a69657b29e40c8be949538 to your computer and use it in GitHub Desktop.
Auto correction implementation for swift based on http://norvig.com/spell-correct.html. Improved the search performance using "trie" data structure.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Trie.swift | |
// AutoCorrect | |
// | |
// Created by Shaul Hameed on 30/07/17. | |
// | |
// | |
import Foundation | |
/// A node in the trie | |
class TrieNode<T: Hashable> { | |
var value: T? | |
weak var parentNode: TrieNode? | |
var children: [T: TrieNode] = [:] | |
var isTerminating = false | |
var isLeaf: Bool { | |
return children.count == 0 | |
} | |
/// Initializes a node. | |
/// | |
/// - Parameters: | |
/// - value: The value that goes into the node | |
/// - parentNode: A reference to this node's parent | |
init(value: T? = nil, parentNode: TrieNode? = nil) { | |
self.value = value | |
self.parentNode = parentNode | |
} | |
/// Adds a child node to self. If the child is already present, | |
/// do nothing. | |
/// | |
/// - Parameter value: The item to be added to this node. | |
func add(value: T) { | |
guard children[value] == nil else { | |
return | |
} | |
children[value] = TrieNode(value: value, parentNode: self) | |
} | |
} | |
/// A trie data structure containing words. Each node is a single | |
/// character of a word. | |
class Trie: NSObject, NSCoding { | |
typealias Node = TrieNode<Character> | |
/// The number of words in the trie | |
public var count: Int { | |
return wordCount | |
} | |
/// Is the trie empty? | |
public var isEmpty: Bool { | |
return wordCount == 0 | |
} | |
/// All words currently in the trie | |
public var words: [String] { | |
return wordsInSubtrie(rootNode: root, partialWord: "") | |
} | |
fileprivate let root: Node | |
fileprivate var wordCount: Int | |
/// Creates an empty trie. | |
override init() { | |
root = Node() | |
wordCount = 0 | |
super.init() | |
} | |
// MARK: NSCoding | |
/// Initializes the trie with words from an archive | |
/// | |
/// - Parameter decoder: Decodes the archive | |
required convenience init?(coder decoder: NSCoder) { | |
self.init() | |
let words = decoder.decodeObject(forKey: "words") as? [String] | |
for word in words! { | |
self.insert(word: word) | |
} | |
} | |
/// Encodes the words in the trie by putting them in an array then encoding | |
/// the array. | |
/// | |
/// - Parameter coder: The object that will encode the array | |
func encode(with coder: NSCoder) { | |
coder.encode(self.words, forKey: "words") | |
} | |
} | |
// MARK: - Adds methods: insert, remove, contains | |
extension Trie { | |
/// Inserts a word into the trie. If the word is already present, | |
/// there is no change. | |
/// | |
/// - Parameter word: the word to be inserted. | |
func insert(word: String) { | |
guard !word.isEmpty else { | |
return | |
} | |
var currentNode = root | |
for character in word.lowercased().characters { | |
if let childNode = currentNode.children[character] { | |
currentNode = childNode | |
} else { | |
currentNode.add(value: character) | |
currentNode = currentNode.children[character]! | |
} | |
} | |
// Word already present? | |
guard !currentNode.isTerminating else { | |
return | |
} | |
wordCount += 1 | |
currentNode.isTerminating = true | |
} | |
/// Determines whether a word is in the trie. | |
/// | |
/// - Parameter word: the word to check for | |
/// - Returns: true if the word is present, false otherwise. | |
func contains(word: String) -> Bool { | |
guard !word.isEmpty else { | |
return false | |
} | |
var currentNode = root | |
for character in word.lowercased().characters { | |
guard let childNode = currentNode.children[character] else { | |
return false | |
} | |
currentNode = childNode | |
} | |
return currentNode.isTerminating | |
} | |
/// Attempts to walk to the last node of a word. The | |
/// search will fail if the word is not present. Doesn't | |
/// check if the node is terminating | |
/// | |
/// - Parameter word: the word in question | |
/// - Returns: the node where the search ended, nil if the | |
/// search failed. | |
private func findLastNodeOf(word: String) -> Node? { | |
var currentNode = root | |
for character in word.lowercased().characters { | |
guard let childNode = currentNode.children[character] else { | |
return nil | |
} | |
currentNode = childNode | |
} | |
return currentNode | |
} | |
/// Attempts to walk to the terminating node of a word. The | |
/// search will fail if the word is not present. | |
/// | |
/// - Parameter word: the word in question | |
/// - Returns: the node where the search ended, nil if the | |
/// search failed. | |
private func findTerminalNodeOf(word: String) -> Node? { | |
if let lastNode = findLastNodeOf(word: word) { | |
return lastNode.isTerminating ? lastNode : nil | |
} | |
return nil | |
} | |
/// Deletes a word from the trie by starting with the last letter | |
/// and moving back, deleting nodes until either a non-leaf or a | |
/// terminating node is found. | |
/// | |
/// - Parameter terminalNode: the node representing the last node | |
/// of a word | |
private func deleteNodesForWordEndingWith(terminalNode: Node) { | |
var lastNode = terminalNode | |
var character = lastNode.value | |
while lastNode.isLeaf, let parentNode = lastNode.parentNode { | |
lastNode = parentNode | |
lastNode.children[character!] = nil | |
character = lastNode.value | |
if lastNode.isTerminating { | |
break | |
} | |
} | |
} | |
/// Removes a word from the trie. If the word is not present or | |
/// it is empty, just ignore it. If the last node is a leaf, | |
/// delete that node and higher nodes that are leaves until a | |
/// terminating node or non-leaf is found. If the last node of | |
/// the word has more children, the word is part of other words. | |
/// Mark the last node as non-terminating. | |
/// | |
/// - Parameter word: the word to be removed | |
func remove(word: String) { | |
guard !word.isEmpty else { | |
return | |
} | |
guard let terminalNode = findTerminalNodeOf(word: word) else { | |
return | |
} | |
if terminalNode.isLeaf { | |
deleteNodesForWordEndingWith(terminalNode: terminalNode) | |
} else { | |
terminalNode.isTerminating = false | |
} | |
wordCount -= 1 | |
} | |
/// Returns an array of words in a subtrie of the trie | |
/// | |
/// - Parameters: | |
/// - rootNode: the root node of the subtrie | |
/// - partialWord: the letters collected by traversing to this node | |
/// - Returns: the words in the subtrie | |
fileprivate func wordsInSubtrie(rootNode: Node, partialWord: String) -> [String] { | |
var subtrieWords = [String]() | |
var previousLetters = partialWord | |
if let value = rootNode.value { | |
previousLetters.append(value) | |
} | |
if rootNode.isTerminating { | |
subtrieWords.append(previousLetters) | |
} | |
for childNode in rootNode.children.values { | |
let childWords = wordsInSubtrie(rootNode: childNode, partialWord: previousLetters) | |
subtrieWords += childWords | |
} | |
return subtrieWords | |
} | |
/// Returns an array of words in a subtrie of the trie that start | |
/// with given prefix | |
/// | |
/// - Parameters: | |
/// - prefix: the letters for word prefix | |
/// - Returns: the words in the subtrie that start with prefix | |
func findWordsWithPrefix(prefix: String) -> [String] { | |
var words = [String]() | |
let prefixLowerCased = prefix.lowercased() | |
if let lastNode = findLastNodeOf(word: prefixLowerCased) { | |
if lastNode.isTerminating { | |
words.append(prefixLowerCased) | |
} | |
for childNode in lastNode.children.values { | |
let childWords = wordsInSubtrie(rootNode: childNode, partialWord: prefixLowerCased) | |
words += childWords | |
} | |
} | |
return words | |
} | |
} | |
// | |
// Prediction.swift | |
// AutoCorrect | |
// | |
// Created by Shaul Hameed on 25/07/17. | |
// | |
// | |
import UIKit | |
extension String { | |
var length: Int { | |
return self.characters.count | |
} | |
subscript (i: Int) -> String { | |
return self[Range(i ..< i + 1)] | |
} | |
func substring(from: Int) -> String { | |
return self[Range(min(from, length) ..< length)] | |
} | |
func substring(to: Int) -> String { | |
return self[Range(0 ..< max(0, to))] | |
} | |
subscript (r: Range<Int>) -> String { | |
let range = Range(uncheckedBounds: (lower: max(0, min(length, r.lowerBound)), | |
upper: min(length, max(0, r.upperBound)))) | |
let start = index(startIndex, offsetBy: range.lowerBound) | |
let end = index(start, offsetBy: range.upperBound - range.lowerBound) | |
return self[Range(start ..< end)] | |
} | |
} | |
// Todo: Have to implement trie algorithm for correction and prediction | |
class Prediction: NSObject { | |
private var _words: [String]? | |
private var FilePath:URL? | |
public static var sharedInstance = Prediction() | |
private let letters = "abcdefghijklmnopqrstuvwxyz" | |
private var trie: Trie = Trie() | |
//Read only data. | |
var dictionary:[String]{ | |
get { | |
guard let words = self._words else { | |
return self.readFromDictionary() | |
} | |
return words | |
} | |
} | |
override init() { | |
super.init() | |
// sets up the word from the library. | |
// we are using a temprory library right now. | |
// will be using users typed in characters to make the results more accurate. | |
self._words = self.readFromDictionary() | |
self.populateTrie(words: self.readFromDictionary()) | |
} | |
private func populateTrie(words: [String]) { | |
for (_, word) in words.enumerated() { | |
self.trie.insert(word: word) | |
} | |
} | |
// reads all the data from the dictionary. | |
private func readFromDictionary() -> [String]{ | |
//Begin reading the data from a dictionary. | |
if let directory = Bundle.main.url(forResource: "Dictionary", withExtension: "txt"){ | |
self.FilePath = directory | |
do { | |
let wordAsString = try String(contentsOf: self.FilePath!, encoding: String.Encoding.utf8) | |
return self.extractWords(source: wordAsString) | |
} | |
catch _ { | |
// todo - write an custom error handler. | |
return [String]() | |
} | |
} | |
return [String]() | |
} | |
// returns all the words from the list. | |
private func extractWords(source: String) -> [String] { | |
let PATTERN = "\\w+" | |
do { | |
let regex = try NSRegularExpression(pattern: PATTERN, options: .caseInsensitive) | |
var nsSource:NSString = NSString(string: source.lowercased()) | |
// This will sanitize the new line and whitespaces. | |
// We are uncerntain about the source. | |
nsSource = nsSource.components(separatedBy: NSCharacterSet.whitespacesAndNewlines).joined(separator: " ") as NSString | |
let searchRange = NSMakeRange(0, nsSource.length) | |
let words = regex.matches(in: nsSource as String, options: .reportCompletion, range: searchRange) | |
return words.map { | |
return nsSource.substring(with: $0.range) | |
} | |
} | |
catch _{ | |
// Need to send a custom exception. | |
return [String]() | |
} | |
} | |
// Searches for the known word matching. | |
// Suits the case when there is no spelling mistakes in your text. | |
private func find(words: [String])-> Set<String>{ | |
var matches: [[String]] = [[String]]() | |
for (_, word) in words.enumerated() { | |
// This is going to take N number of times for searching it. | |
matches.append(trie.findWordsWithPrefix(prefix: word)) | |
} | |
return Set<String>(matches.flatMap{$0}) | |
} | |
private func editsForOne(word: String)-> Set<String>{ | |
let split: [(String, String)] = self.getSplit(forWord: word) | |
return self.find(words: Array(self.gerenateEditForOne(forSplit: split))) | |
} | |
private func gerenateEditForOne(forSplit: [(String, String)]) -> Set<String>{ | |
var deletes: [String] = [String]() | |
var transposes: [String] = [String]() | |
var replaces: [String] = [String]() | |
var inserts: [String] = [String]() | |
for (_, value) in forSplit.enumerated() { | |
let left = "\(value.0)" | |
let right = "\(value.1)" | |
let sourceLength = right.characters.count | |
if(sourceLength > 1){ | |
// generate transpose | |
let range = Range(2 ..< right.characters.count) | |
let transposedWord = "\(left)\(right[1])\(right[0])\(right[range])" | |
transposes.append(transposedWord) | |
} | |
if(sourceLength > 0){ | |
// generates deletes | |
let deleteRange = Range(1 ..< right.characters.count) | |
let deletedWord = "\(left)\(right[deleteRange])" | |
deletes.append(deletedWord) | |
// generates replaces. | |
for c in letters.characters.enumerated() { | |
let replaceRange = Range(1 ..< right.characters.count) | |
let repacedWord = "\(left)\(c.element)\(right[replaceRange])" | |
replaces.append(repacedWord) | |
// generates inserts | |
let insertedWord = "\(left)\(c.element)\(right)" | |
inserts.append(insertedWord) | |
} | |
} | |
} | |
return Set<String>([deletes, transposes, replaces, inserts].flatMap{$0}) | |
} | |
private func getDeletes(forSplit: [(String, String)]) -> Set<String>{ | |
return Set<String>() | |
} | |
private func getSplit(forWord: String)-> [(String, String)]{ | |
var splits:[(String, String)] = [(String, String)]() | |
var length = forWord.characters.count | |
for i in 0...forWord.characters.count { | |
let forwardRange = NSMakeRange(0, i) | |
let reverseRange = NSMakeRange(i, length) | |
let firstString = (forWord as NSString).substring(with: forwardRange) | |
let secondString = (forWord as NSString).substring(with: reverseRange) | |
splits.append((firstString, secondString)) | |
length = length - 1 | |
} | |
return splits; | |
} | |
private func convertTo( mutable: String)-> String{ | |
// we are doing this to avoid segfault. | |
// i.e. making the immutable mutable. | |
var mutable = mutable | |
let leftShift = mutable.withMutableCharacters({ (chars) -> String.CharacterView in | |
return chars | |
}) | |
return String(leftShift) | |
} | |
//MARK: Public functions | |
//PS: Need to write a benchmark for this. | |
//retuns back an array of corrected text. | |
func getCorrected(text: String) -> Set<String> { | |
// standard find. | |
let standardSearch = self.find(words:[text]) | |
if(standardSearch.count < 1) { | |
let editForOneSearch = self.editsForOne(word: text) | |
if(editForOneSearch.count < 1){ | |
var secondList:[[String]] = [[String]]() | |
let splits = self.getSplit(forWord: text) | |
for (_, word) in self.gerenateEditForOne(forSplit: splits).enumerated() { | |
let splitsForSecond = self.getSplit(forWord: word) | |
secondList.append(Array(self.gerenateEditForOne(forSplit: splitsForSecond))) | |
} | |
return self.find(words: secondList.flatMap{$0}) | |
} | |
else { | |
return editForOneSearch | |
} | |
} | |
else { | |
return standardSearch | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment