Created
March 21, 2021 09:17
-
-
Save jtanx/810af916741d7bcc5c909f3545021e73 to your computer and use it in GitHub Desktop.
3 vs 4-level lookup tables for unicode data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on the algorithm in https://github.com/python/cpython/blob/master/Tools/unicode/makeunicodedata.py | |
import math | |
import sys | |
def getsize(data): | |
# return smallest possible integer size for the given array | |
maxdata = max(data) | |
if maxdata < 256: | |
return 1 | |
elif maxdata < 65536: | |
return 2 | |
else: | |
return 4 | |
def splitbinso(t, trace=0): | |
"""t, trace=0 -> (t1, t2, shift). Split a table to save space. | |
t is a sequence of ints. This function can be useful to save space if | |
many of the ints are the same. t1 and t2 are lists of ints, and shift | |
is an int, chosen to minimize the combined size of t1 and t2 (in C | |
code), and where for each i in range(len(t)), | |
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] | |
where mask is a bitmask isolating the last "shift" bits. | |
If optional arg trace is non-zero (default zero), progress info | |
is printed to sys.stderr. The higher the value, the more info | |
you'll get. | |
""" | |
if trace: | |
def dump(t1, t2, shift, bytes): | |
print("%d+%d bins at shift %d; %d bytes" % ( | |
len(t1), len(t2), shift, bytes), file=sys.stderr) | |
print("Size of original table:", len(t)*getsize(t), "bytes", | |
file=sys.stderr) | |
n = len(t)-1 # last valid index | |
maxshift = 0 # the most we can shift n and still have something left | |
if n > 0: | |
while n >> 1: | |
n >>= 1 | |
maxshift += 1 | |
del n | |
bytes = sys.maxsize # smallest total size so far | |
t = tuple(t) # so slices can be dict keys | |
for shift in range(maxshift + 1): | |
t1 = [] | |
t2 = [] | |
size = 2**shift | |
bincache = {} | |
for i in range(0, len(t), size): | |
bin = t[i:i+size] | |
index = bincache.get(bin) | |
if index is None: | |
index = len(t2) | |
bincache[bin] = index | |
t2.extend(bin) | |
t1.append(index >> shift) | |
# determine memory size | |
b = len(t1)*getsize(t1) + len(t2)*getsize(t2) | |
if trace > 1: | |
dump(t1, t2, shift, b) | |
if b < bytes: | |
best = t1, t2, shift | |
bytes = b | |
t1, t2, shift = best | |
if trace: | |
print("Best:", end=' ', file=sys.stderr) | |
dump(t1, t2, shift, bytes) | |
if __debug__: | |
# exhaustively verify that the decomposition is correct | |
mask = ~((~0) << shift) # i.e., low-bit mask of shift bits | |
for i in range(len(t)): | |
assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] | |
return best | |
def splitbins(arr): | |
maxshift = max(math.frexp(len(arr)-1)[1] - 1, 0) | |
data = tuple(arr) | |
maxsize = sys.maxsize | |
best = None | |
for shift1 in range(maxshift + 1): | |
for shift2 in range(0, shift1 + 1): | |
t1, t2, t3 = [], [], [] | |
size1 = 1 << shift1 | |
size2 = 1 << shift2 | |
mask1 = ~((~0)<<(shift1 - shift2)) | |
mask2 = ~((~0)<<shift2) | |
t2cache, t3cache = {}, {} | |
# print("SHIFTS", shift1, shift2, "SIZES", size1, size2) | |
for k in range(0, len(data), size1): | |
# steps of A | |
indices = [] | |
for l in range(0, size1, size2): | |
# steps of B | |
# print(" BIN SIZE", size2, k+l, k+l+size2) | |
bin = data[k+l:k+l+size2] | |
index = t3cache.get(bin) | |
if index is None: | |
index = len(t3) | |
t3cache[bin] = index | |
t3.extend(bin) | |
indices.append(index) | |
bin = tuple(indices) | |
# print(" INDICES", bin) | |
index = t2cache.get(bin) | |
if index is None: | |
index = len(t2) | |
t2cache[bin] = index | |
t2.extend(bin) | |
t1.append(index) | |
size = len(t1) * getsize(t1) + len(t2) * getsize(t2) + len(t3) * getsize(t3) | |
if size < maxsize: | |
maxsize = size | |
best = (shift1, shift2, t1, t2, t3) | |
for i in range(len(data)): | |
v = arr[i] | |
vv = t3[t2[t1[i >> shift1] + ((i >> shift2) & mask1)] + (i & mask2)] | |
if v != vv: | |
print("WTF", "Shifts", shift1, shift2, "At", i, "values", v, "vs", vv) | |
return None | |
print(shift1, shift2, len(t1), len(t2), len(t3), size) | |
print("MIN", maxsize, best[0], best[1]) | |
return best |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment