Created
June 24, 2012 02:09
-
-
Save dimo414/2981061 to your computer and use it in GitHub Desktop.
Testing Prefix Lookup Data Structures
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bisect | |
import error; | |
class Prefix: | |
''' | |
A prefix data structure built on a sorted list, which uses binary search. | |
Note that for more reasonable lookup, it only searches in lower | |
case. This means there can be colliding strings such as 'prefix' vs 'Prefix'. | |
In this case, more recent inserts override older ones. | |
''' | |
def __init__(self, ls=[], presorted=False): | |
self._aliases = {} | |
self._list = ls if presorted else sorted(ls) | |
self._keys = [s.lower() for s in self._list] | |
# Note that since we usually use these methods together, it's wasteful to | |
# Compute prefix.lower() for both - as such, these methods assume prefix | |
# is already lower case. | |
def _getindex(self, prefix): | |
return bisect.bisect_left(self._keys, prefix) | |
def _getnextindex(self, prefix): | |
# http://stackoverflow.com/a/7381253/113632 | |
lo, hi = 0, len(self._keys) | |
while lo < hi: | |
mid = (lo+hi)//2 | |
if prefix < self._keys[mid] and not self._keys[mid].startswith(prefix): hi = mid | |
else: lo = mid+1 | |
return lo | |
def __getitem__(self, prefix): | |
'''Return the item with the given prefix. | |
If more than one item matches the prefix an AmbiguousPrefix exception | |
will be raised, unless the prefix is the entire ID of one task. | |
If no items match the prefix an UnknownPrefix exception will be raised. | |
If an item exactly matches the prefix, it will be returned even if | |
there exist other (longer) items which match the prefix | |
''' | |
pre = prefix.lower() | |
ret = self._list[self._getindex(pre):self._getnextindex(pre)] | |
if ret: | |
if len(ret) == 1 or ret[0].lower() == pre: | |
return ret[0] | |
raise error.AmbiguousPrefix(prefix,ret) | |
raise error.UnknownPrefix(prefix) | |
def prefix(self, item): | |
'''Return the unique prefix of the given item, or None if not found''' | |
ln = len(self._keys) | |
item = item.lower() | |
index = self._getindex(item) | |
if index >= ln: | |
return None | |
match = self._keys[index] | |
if not match.startswith(item): | |
return None | |
siblings = [] | |
if index > 0: | |
siblings.append(self._keys[index-1]) | |
if index < ln-1: | |
siblings.append(self._keys[index+1]) | |
if not siblings: #list contains only item | |
return match[0] | |
return self._uniqueprefix(match,siblings) | |
def _uniqueprefix(self,match,others): | |
'''Returns the unique prefix of match, against the set of others''' | |
ret = [] | |
#print("START:",match,others) | |
while match: | |
others = [s[1:] for s in others if s and s[0] == match[0]] | |
ret.append(match[0]) | |
match = match[1:] | |
#print("WHILE:",match,others,''.join(ret)) | |
if not others: | |
return ''.join(ret) | |
return None | |
def add(self,item): | |
'''Add an item to the data structure. | |
This uses list.insert() which is O(n) - for many insertions, | |
it may be dramatically faster to simply build a new Prefix entirely.''' | |
lower = item.lower() | |
index = self._getindex(lower) | |
# If overwriting same key | |
if index < len(self._keys) and self._keys[index] == lower: | |
self._list[index] = item | |
else: | |
self._keys.insert(index,lower) | |
self._list.insert(index,item) | |
def alias(self,alias,item): | |
'''Add an item to the trie which maps to another item''' | |
self._aliases[alias] = self[item] | |
self.add(alias) | |
def pref_str(self,pref,short=False): | |
'''returns the item with a colon indicating the shortest unique prefix''' | |
item = self[pref] | |
pref = self.prefix(item) | |
tail = item[len(pref):] | |
return item[:len(pref)]+':'+(tail[:4]+('...' if len(tail) > 4 else '') if short else tail) | |
def __iter__(self): | |
return iter(self._list) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class UnknownPrefix(Exception): | |
'''Raised if a given prefix does not map to any items''' | |
def __init__(self,prefix,*args): | |
Exception.__init__(self,*args) | |
self.prefix = prefix | |
class AmbiguousPrefix(Exception): | |
'''Raised if a given prefix maps to multiple items''' | |
def __init__(self,prefix,choices,*args): | |
Exception.__init__(self,*args) | |
self.prefix = prefix | |
self.choices = choices |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib,time | |
import bprefix,tprefix | |
def timed(f): | |
def func(*args): | |
start = time.time() | |
ret = f(*args) | |
took = time.time() - start | |
print("%s took %f" % (f.__name__,took)) | |
return ret | |
return func | |
def get_generator(top=250000,static="Static_String"): | |
return (hashlib.sha1((static+str(i)).encode('utf-8')).hexdigest() for i in range(top)) | |
@timed | |
def build_from_list(cls): | |
return cls(get_generator()) | |
@timed | |
def build_from_adds(cls): | |
pref = cls() | |
for s in get_generator(): | |
pref.add(s) | |
return pref | |
@timed | |
def add_to(obj): | |
for s in get_generator(10000,"Different_String"): | |
obj.add(s) | |
@timed | |
def get(obj,loops=10000): | |
for _ in range(loops): | |
obj['000188'] | |
obj['1971e'] | |
obj['336f7'] | |
obj['4d120'] | |
obj['66ada'] | |
obj['80736'] | |
obj['99cb0'] | |
obj['b38f3'] | |
obj['ccfd9e8'] | |
obj['e61df'] | |
@timed | |
def prefix(obj,loops=10000): | |
for _ in range(loops): | |
obj.prefix('00018855b442bfba15fae6949982ef63d9eba1c9') | |
obj.prefix('1971e17df8ee57f0dcceccc869db454b7c6b7a54') | |
obj.prefix('336f7b09c7c0c933b1f26ca09a84585818046e6b') | |
obj.prefix('4d1209fadf843f65bd2beee37db552134a930395') | |
obj.prefix('66adaf0cb611d71554153631611f7904781addef') | |
obj.prefix('80736f454201d96c4c795bb5e21778550a9cbef0') | |
obj.prefix('99cb006fd81cb84cfbae834ed7b7f977c29af249') | |
obj.prefix('b38f3561591650708fce739536ac504f86fecdf5') | |
obj.prefix('ccfd9e8211621666c55f911d1ff3f13ab93f696e') | |
obj.prefix('e61df82a0c2f59394eb9bd752e93b9c011df5be2') | |
@timed | |
def iter(obj): | |
for s in obj: | |
len(s) | |
def run_tests(cls): | |
pref = build_from_list(cls) | |
build_from_adds(cls) | |
add_to(pref) | |
get(pref) | |
prefix(pref) | |
iter(pref) | |
if __name__ == '__main__': | |
print("TRIE STRUCTURE") | |
run_tests(tprefix.Prefix) | |
print("BINARY SEARCH STRUCTURE") | |
run_tests(bprefix.Prefix) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import error; | |
class Prefix: | |
''' | |
A trie datastructure (http://en.wikipedia.org/wiki/Trie) which enables | |
fast data retrieval by prefix. | |
Note that for more reasonable lookup, the trie only searches in lower | |
case. This means there can be colliding strings such as 'prefix' vs 'Prefix'. | |
In this case, more recent inserts override older ones. | |
''' | |
def __init__(self,ls=[]): | |
self._root = _Node(None,None,True) | |
self._aliases = {} | |
for i in ls: | |
self._root.add(i.lower(),i) | |
def __getitem__(self, prefix): | |
'''Return the item with the given prefix. | |
If more than one item matches the prefix an AmbiguousPrefix exception | |
will be raised, unless the prefix is the entire ID of one task. | |
If no items match the prefix an UnknownPrefix exception will be raised. | |
If an item exactly matches the prefix, it will be returned even if | |
there exist other (longer) items which match the prefix | |
''' | |
matched = self._root[prefix.lower()] | |
if (matched is None or | |
(matched.result is None and len(matched.children) == 0) or | |
# this is needed because prefix will return if prefix | |
# is longer than necessary, and the necessary part | |
# matches, but the extra text does not | |
(matched.key is not None and not matched.key.startswith(prefix.lower()))): | |
raise error.UnknownPrefix(prefix) | |
if matched.result is not None: | |
if matched.result in self._aliases: | |
return self._aliases[matched.result] | |
else: return matched.result | |
else: | |
raise error.AmbiguousPrefix(prefix,iter(matched)) | |
def prefix(self,item): | |
'''Return the unique prefix of the given item, or None if not found''' | |
return self._root.prefix(item.lower()) | |
def pref_str(self,pref,short=False): | |
'''returns the item with a colon indicating the shortest unique prefix''' | |
item = self[pref] | |
pref = self.prefix(item) | |
tail = item[len(pref):] | |
return item[:len(pref)]+':'+(tail[:4]+('...' if len(tail) > 4 else '') if short else tail) | |
def add(self,item): | |
'''Add an item to the data structure''' | |
self._root.add(item.lower(),item,0) | |
def alias(self,alias,item): | |
'''Add an item to the trie which maps to another item''' | |
self._aliases[alias] = self[item] | |
self.add(alias) | |
def __iter__(self): | |
return iter(self._root) | |
class _Node: | |
'''Represents a node in the Trie. It contains either | |
an exact match, a set of children, or both | |
''' | |
def __init__(self, key, item, final=False): | |
'''Constructs a new node which contains an item''' | |
self.final = final | |
self.key = key | |
self.result = item | |
self.children = {} | |
def _tree(self): | |
'''Returns a tree structure representing the trie. Useful for debugging''' | |
return "( %s%s, { %s } )" % (self.result, '*' if self.final else '', | |
', '.join(["%s: %s" % (k,v._tree()) for (k,v) in self.children.items()])) | |
def add(self,key,item,depth=0): | |
'''Adds an item at this node or deeper. Depth indicates | |
which index is being used for comparison''' | |
# the correct node has been found, replace result with new value | |
if self.key is not None and key == self.key: | |
self.result = item #this would override an old value | |
return | |
# this is currently a leaf node, move the leave one down | |
if self.result is not None and not self.final: | |
if self.key == None: print(self.key,self.result,key,item) | |
key_letter = self.key[depth] | |
self.children[key_letter] = _Node(self.key,self.result,len(self.key)==depth+1) | |
self.key = None | |
self.result = None | |
if len(item) == depth: | |
self.key = key | |
self.result = item #this could override an old value | |
self.final = True | |
return | |
letter = key[depth] | |
if letter in self.children: | |
child = self.children[letter] | |
child.add(key,item,depth+1) | |
else: | |
self.children[letter] = _Node(key,item,len(key) == depth+1) | |
def __getitem__(self,prefix): | |
'''Given a prefix, returns the node that matches | |
This will either have a result, or if not the prefix | |
was ambiguous. If None is returned, there was no | |
such prefix''' | |
if len(prefix) == 0 or len(self.children) == 0: | |
return self | |
letter = prefix[0] | |
if letter in self.children: | |
return self.children[letter][prefix[1:]] | |
else: return None | |
def prefix(self,item): | |
'''Given an item (or a prefix) finds the shortest | |
prefix necessary to reach the given item. | |
None if item does not exist.''' | |
if len(item) == 0 or len(self.children) == 0: | |
return '' | |
letter = item[0] | |
if letter in self.children: | |
child = self.children[letter].prefix(item[1:]) | |
if child is not None: | |
return letter + child | |
return None | |
def __iter__(self): | |
'''Yields items in and below this node''' | |
if self.result: | |
yield self.result | |
for k in sorted(self.children.keys()): | |
for res in self.children[k]: | |
yield res |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment