Created
November 7, 2017 18:09
-
-
Save nodirt/dc708ab1ff3f7ed39cd0c069218e8aa5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import base64, bisect, genericpath, string, zlib | |
def compress(items): | |
class Node(object): | |
def __init__(self): | |
self.children = {} | |
self.descendants = 0 | |
self.value = False | |
def compress(self): | |
def visit(self, self_key): | |
self.children = { | |
visit(child, key): child | |
for key, child in self.children.iteritems() | |
if child.children or child.value | |
} | |
if self_key and len(self.children) == 1 and not self.value: | |
[key, child], = self.children.iteritems() | |
self_key += key | |
self.children = child.children | |
self.value = child.value | |
return self_key | |
visit(self, None) | |
def update_descendants(self): | |
self.descendants = 0 | |
for c in self.children.itervalues(): | |
c.update_descendants() | |
self.descendants += 1 + c.descendants | |
def dump(self, out): | |
for key, child in sorted(self.children.iteritems()): | |
if child.descendants > 0: | |
sep = '+' if child.value else ' ' | |
out.append('%d%s%s' % (child.descendants, sep, key)) | |
elif child.value: | |
out.append(key) | |
child.dump(out) | |
root = Node() | |
for item in items: | |
node = root | |
for c in item: | |
child = node.children.get(c) | |
if not child: | |
child = Node() | |
node.children[c] = child | |
node = child | |
node.value = True | |
root.compress() | |
root.update_descendants() | |
out = [] | |
root.dump(out) | |
return out | |
def is_present(d, k): | |
i = 0 | |
iterations = 0 | |
found = False | |
orig_k = k | |
while k and i < len(d): | |
iterations += 1 | |
key = d[i] | |
descendants = 0 | |
value = True | |
if ' ' in key or '+' in key: | |
value = '+' in key | |
sep = '+' if value else ' ' | |
descendants, key = key.split(sep, 2) | |
descendants = int(descendants) | |
if k == key and value: | |
found = True | |
break | |
if k.startswith(key): | |
if descendants == 0: | |
break | |
k = k[len(key):] | |
i += 1 # we must go deeper | |
elif key > k: | |
break | |
else: | |
i += descendants + 1 # next sibling | |
print('search for %s took %d iterations' % (orig_k, iterations)) | |
return found | |
def test2(): | |
engs = open('eng.txt').read().splitlines() | |
flat_engs = '\n'.join(engs) | |
d = compress(engs) | |
flat_d = '\n'.join(d) | |
open('eng.tree', 'w').write(flat_d) | |
print 'Raw data descendants: %d' % len(flat_engs) | |
print 'Encoded descendants: %d' % len(flat_d) | |
print 'Compressed descendants: %d' % len(zlib.compress(flat_d)) | |
print 'Compressed base64: %d' % len(base64.b64encode(zlib.compress(flat_d))) | |
# Show how it looks like: | |
print flat_d[:100] | |
decoded = flat_d.split('\n') | |
matches = ['maruel', 'vadimsh'] | |
for m in matches: | |
assert is_present(decoded, m), m | |
misses = ['xxxxx', 'aaaaaaa', 'qwerty'] | |
for m in misses: | |
assert not is_present(decoded, m), m | |
test2() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment