-
-
Save roccogalluzzo/5804686 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
prefix_code.py | |
Copyright 2012-2013 Josiah Carlson | |
Released under the GNU LGPL v 2.1 license | |
This module offers the ability to encode/decode a sequence of integers into | |
strings that can then be used to compare sequences of integers (or paths on | |
trees) quickly. This module was originally intended to be used in a case- | |
preserving index in a relational database (where 'Z' comes before 'a', as is | |
the case with binary coallation). | |
Methods *very similar* to this were used to handle paginated tree-structured | |
comments for the YouTube Groups forums (which was used from August 2009 to | |
December 2010). My impotice to write this code came as a result of reading the | |
following blog post, an realizing that I'd solved the same problem a couple | |
years before: | |
http://justcramer.com/2012/04/08/using-arrays-as-materialized-paths-in-postgres/ | |
I don't have a name for this type of coding, though it shares some concepts | |
of golomb codes (specifying the length of the sub-code, while at the same time | |
encoding information about how many bytes are used to encode the number. | |
There are a few other similar ideas that could (for example) use 1 bit of | |
every X grouped bits to offer magnitude scaling (which would be 1 bit of every | |
6 bits below), but those methods do not offer the same sort of comparison | |
consistency that this method offers. | |
''' | |
from itertools import chain | |
import string | |
# The choice and order of these values is such that each character is sorted | |
# with respect to other characters, and the relative order of integers is the | |
# same before and after conversion to a string. | |
to_char = '!.' + string.digits + string.uppercase + string.lowercase | |
to_int = dict((k,v) for v,k in enumerate(to_char)) | |
cutoff = len(to_char) - 11 | |
# 0 ... cutoff-1 -> single char | |
# with X digits, x > 1 you can represent values: | |
# cutoff + len(to_char)**(X-2) - 1 to cutoff + len(to_char)**(X-1) - 1 | |
# So with a a cutoff of 64 - 11, you can represent 0 to 2**66 + cutoff - 1 | |
def to_chars(value): | |
''' | |
Converts an integer to its text-encoded form. | |
''' | |
if value < cutoff: | |
return to_char[value] | |
value -= cutoff | |
out = [] | |
while value: | |
value, rem = divmod(value, len(to_char)) | |
out.append(to_char[rem]) | |
# handle when value == cutoff | |
if not out: | |
out.append(to_char[value]) | |
out.append(to_char[cutoff + len(out) - 1]) | |
out.reverse() | |
return ''.join(out) | |
def to_val(chars): | |
''' | |
Converts characters from the provided string back into their integer | |
representation, and returns both the desired integer as well as the number | |
of bytes consumed from the character string (this function can accept a | |
string that is the result of a concatenation of results from to_chars() ). | |
''' | |
first = to_int[chars[0]] | |
if first < cutoff: | |
return first, 1 | |
first -= cutoff - 1 | |
dec = [] | |
for ch in chars[1:1+first]: | |
dec.append(to_int[ch]) | |
value = dec.pop() + cutoff | |
m = len(to_char) | |
while dec: | |
value += m * dec.pop() | |
m *= len(to_char) | |
return value, first + 1 | |
def from_sequence(lst): | |
''' | |
Converts a sequence of integers into a string that represents those | |
integers. | |
''' | |
return ''.join(map(to_chars, lst)) | |
def from_string(str): | |
''' | |
Converts a string that represents a sequence of integers back into a list | |
of integers. | |
''' | |
out = [] | |
i = 0 | |
while i < len(str): | |
this, used = to_val(str[i:]) | |
out.append(this) | |
i += used | |
return out | |
def from_sequence_delta(lst): | |
''' | |
The same as from_sequence(), except that each subsequent integer has the | |
previous integer subtracted from it. This can reduce data storage | |
somewhat, depending on your data. | |
This method requires that all subsequent values in the sequence are at | |
least as large as those proceeding it. Which is to say the following are | |
good sequences: | |
[1, 2, 3] | |
[1, 1, 1, 2, 2, 2] | |
But the following are bad sequences: | |
[1, 2, 3, 2] | |
[1, 0] | |
''' | |
return from_sequence(this - prev for prev, this in zip(chain([0], lst), lst)) | |
def from_string_delta(str): | |
''' | |
The reverse of from_sequence_delta(). | |
''' | |
d = from_string(str) | |
for i in xrange(1, len(d)): | |
d[i] += d[i-1] | |
return d | |
def test(): | |
import random | |
for i in xrange(0, 1024, 100): | |
x = [i] | |
y = from_sequence(x) | |
z = from_string(y) | |
assert x == z, (x, y, z) | |
print (x, y, z) | |
x = [2**random.randrange(10) + random.randrange(128) for i in xrange(10)] | |
y = from_sequence(x) | |
z = from_string(y) | |
assert x == z, (x, y, z) | |
print (x, y, z) | |
x.sort() | |
y = from_sequence_s(x) | |
z = from_string_s(y) | |
assert x == z, (x, y, z) | |
print (x, y, z) | |
x = [2**66 + cutoff - 1] | |
y = from_sequence(x) | |
z = from_string(y) | |
assert x == z, (x, y, z) | |
print (x, y, z) | |
for i in xrange(1000): | |
x = [[random.randrange(10000)] for i in xrange(100)] | |
y = map(from_sequence, x) | |
x.sort() | |
y.sort() | |
z = map(from_string, y) | |
assert x == z, (x, y, z) | |
print hex(from_string('zzzzzzzzzzzz')[0] - cutoff).count('f') | |
if __name__ == '__main__': | |
test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment