Created
June 11, 2015 02:09
-
-
Save dennislysenko/0cc812ba7a87ddf57817 to your computer and use it in GitHub Desktop.
Fuzzy String Matching (BK Tree)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fuzzy string-matching algorithm | |
// Implementation of algorithm described at: http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees | |
// Essentially builds an index of strings by levenshtein distance (N-ary tree) on which you can run range queries. | |
// The root node can be chosen arbitrarily. Each node holds a string and a collection of edges, representing distance to other strings, which then have their own children and so on, building a complete index. | |
// See https://github.com/vy/bk-tree for (impressive) performance statistics despite this tending to create an unbalanced N-ary tree | |
class BKTreeNode { | |
var value: String | |
var edges: Dictionary<Int, BKTreeNode> | |
init(value: String) { | |
self.value = value | |
self.edges = Dictionary<Int, BKTreeNode>() | |
} | |
func childWithDistance(distance: Int) -> BKTreeNode? { | |
return edges[distance] | |
} | |
func addChild(string: String, distance: Int) { | |
edges[distance] = BKTreeNode(value: string) | |
} | |
} | |
func insertIntoBKTree(root: BKTreeNode, string: String) { | |
let initialDistance = levenshtein(string, root.value) | |
// try to insert a node with this string unless there is already a child node on root with the same distance as we just calculated | |
// if there is one with the same distance, we keep recurring | |
if let child = root.childWithDistance(initialDistance) { | |
insertIntoBKTree(child, string) | |
} else { | |
root.addChild(string, distance: initialDistance) | |
} | |
} | |
func queryBKTree(root: BKTreeNode, query: String, range: Int) -> [String] { | |
let initialDistance = levenshtein(query, root.value) | |
// the way these trees are laid out, even when we find a match, we need to keep recurring | |
var results: [String] = [] | |
if initialDistance <= range { | |
results.append(root.value) | |
} | |
//n = range, d = initialDistance | |
//range of possible distances where we might find matches: [d-n,d+n] by the triangle inequality | |
//translates to [initialDistance-range, initialDistance+range] | |
let min = initialDistance - range | |
let max = initialDistance + range | |
for distance in min...max { | |
if let child = root.childWithDistance(distance) { | |
let tempResults = queryBKTree(child, query, range) | |
//results.splice(tempResults, atIndex: 0) // Is this a performance concern..? | |
for result in tempResults { // see: Occam's razor | |
results.append(result) | |
} | |
} | |
} | |
return results | |
} | |
// Demo | |
let names = ["Dennis", // 0 | |
"Mennis", "Tennis", "Aennis", "Denis", // 1 | |
"Deddis", "Dinnes", // 2 | |
"Den", // 3 | |
"Dinesh"] // 4 | |
let allNamesButFirst = names[1..<names.count] | |
// arbitrarily choose element 0 to be the root. theoretically, choosing "Deddis" or "Dinnes" would create the most balanced tree. however I question the necessity of proper balancing in this algorithm | |
let root = BKTreeNode(value: names[0]) | |
for name in names[1..<names.count] { | |
insertIntoBKTree(root, name) | |
} | |
queryBKTree(root, "Dennis", 0) // exact match => ["Dennis"] | |
queryBKTree(root, "Dennis", 1) // => ["Dennis", "Mennis", "Tennis", "Aennis", "Denis"] | |
queryBKTree(root, "Dennis", 3) // => ["Dennis", "Mennis", "Tennis", "Aennis", "Denis", "Deddis", "Dinnes", "Den"] |
loop through all the distancefor distance in min...max
may affect the performance b/c some nodes may have less children than that, which means you loop more
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note you must provide your own
levenshtein
implementation; look at https://gist.github.com/bgreenlee/52d93a1d8fa1b8c1f38b for a suitable one.