Created
March 3, 2010 12:51
-
-
Save pervognsen/320586 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def all_equal(iterable): | |
it = iter(iterable) | |
first = next(it, None) | |
return all(x == first for x in it) | |
def median(seq, key=lambda x: x): | |
return sorted(seq, key=key)[len(seq) / 2] | |
def partition(seq, pivot, key=lambda x: x): | |
left, middle, right = [], [], [] | |
for x in seq: | |
if key(x) < key(pivot): | |
left.append(x) | |
elif key(x) > key(pivot): | |
right.append(x) | |
else: | |
middle.append(x) | |
return left, middle, right | |
# --- | |
def hamming_distance(xs, ys): | |
return abs(len(xs) - len(ys)) + sum(1 for x, y in zip(xs, ys) if x != y) | |
# --- | |
def kdentry(key, value): | |
return (key, value) | |
def kdkey(entry): | |
return entry[0] | |
def kdvalue(entry): | |
return entry[1] | |
class kdleaf: | |
def __init__(self, entries): | |
self.entries = entries | |
def size(self): | |
return len(self.entries) | |
def depth(self): | |
return 1 | |
def search(self, center, radius): | |
return (kdvalue(entry) for entry in self.entries) | |
class kdnode: | |
def __init__(self, dimension, separator, left, middle, right): | |
self.dimension = dimension | |
self.separator = separator | |
self.left = left | |
self.middle = middle | |
self.right = right | |
def size(self): | |
return self.left.size() + self.middle.size() + self.right.size() | |
def depth(self): | |
return 1 + max(self.left.depth(), self.middle.depth(), self.right.depth()) | |
def search(self, center, radius): | |
if center[self.dimension] - radius < self.separator: | |
for value in self.left.search(center, radius): | |
yield value | |
if abs(center[self.dimension] - self.separator) <= radius: | |
for value in self.middle.search(center, radius): | |
yield value | |
if center[self.dimension] + radius > self.separator: | |
for value in self.right.search(center, radius): | |
yield value | |
kdtree_split_threshold = 64 | |
def build_kdtree(dimensions, entries, split_dimension=0): | |
if len(entries) < kdtree_split_threshold or all_equal(kdkey(entry) for entry in entries): | |
return kdleaf(entries) | |
split_key = lambda entry: kdkey(entry)[split_dimension] | |
pivot = median(entries, key=split_key) | |
left, middle, right = partition(entries, pivot, key=split_key) | |
next_split_dimension = (split_dimension + 1) % dimensions | |
if left or right: | |
return kdnode(split_dimension, split_key(pivot), | |
build_kdtree(dimensions, left, next_split_dimension), | |
build_kdtree(dimensions, middle, next_split_dimension), | |
build_kdtree(dimensions, right, next_split_dimension)) | |
else: | |
return build_kdtree(dimensions, entries, next_split_dimension) | |
# --- | |
def word_vector(word): | |
word = word.lower() | |
coords = [0] * 26 | |
for char in word: | |
coords[ord(char) - ord('a')] += 1 | |
return tuple(coords) | |
def build_word_tree(words): | |
return build_kdtree(26, [kdentry(word_vector(word), word) for word in words]) | |
# TODO: Use Levenshtein distance for false-positive filtering. Hamming distance is an upper bound. | |
def word_search(word_tree, word, radius): | |
candidates = word_tree.search(word_vector(word), radius) | |
return [candidate for candidate in candidates if hamming_distance(word, candidate) <= radius] | |
# --- | |
words = [word.strip() for word in open("/usr/share/dict/words")] | |
word_tree = build_word_tree(words) | |
print len(words) | |
print word_tree.size() | |
print word_tree.depth() | |
print word_search(word_tree, "book", 0) | |
print word_search(word_tree, "book", 1) | |
print word_search(word_tree, "book", 2) | |
print word_search(word_tree, "book", 3) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment