Last active
August 23, 2024 08:17
-
-
Save jd/b0aa5cbfa42f4eb23eb9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
# | |
# Copyright © 2016 Red Hat, Inc. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); you may | |
# not use this file except in compliance with the License. You may obtain | |
# a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
# License for the specific language governing permissions and limitations | |
# under the License. | |
import struct | |
import bitarray | |
def double_to_int(f): | |
"""Convert a 64 bits float to a 64 bits integer.""" | |
return struct.unpack("!Q", struct.pack("!d", f))[0] | |
def int_to_double(i): | |
"""Convert an integer to a 64 bits float.""" | |
return struct.unpack("!d", struct.pack("!Q", i))[0] | |
def to_binary64(v): | |
"""Convert value to a binary representation on 64 bits as a string.""" | |
return "{:064b}".format(v) | |
def count_lead_and_trail_zeroes(d): | |
"""Count the number of leading and trailing zeroes in an integer.""" | |
as_str = "{:64b}".format(d) | |
try: | |
leading_zeroes = as_str.index("1") | |
trailing_zeroes = 63 - as_str.rindex("1") | |
except ValueError: | |
return 64, 64 | |
return leading_zeroes, trailing_zeroes | |
def encode_xor_floats(floats): | |
"""Encode a list of floats using XOR encoding. | |
Encoding stolen from paper: | |
"Gorilla: A Fast, Scalable, In-Memory Time Series Database" | |
""" | |
if len(floats) == 0: | |
return b"" | |
prev = double_to_int(floats[0]) | |
encoded = bitarray.bitarray(to_binary64(prev)) | |
prev_lz = prev_tz = 0 | |
for v in floats[1:]: | |
v = double_to_int(v) | |
xor = prev ^ v | |
if xor == 0: | |
encoded.append(0) | |
else: | |
encoded.append(1) | |
lz, tz = count_lead_and_trail_zeroes(xor) | |
if lz >= 32: | |
lz = 31 | |
if prev_lz != 0 and lz >= prev_lz and tz >= prev_tz: | |
encoded.append(0) | |
encoded.extend( | |
to_binary64(xor >> prev_tz)[- (64 - prev_lz - prev_tz):]) | |
else: | |
prev_lz = lz | |
prev_tz = tz | |
encoded.append(1) | |
encoded.extend(to_binary64(lz)[-5:]) | |
sigbits = 64 - lz - tz | |
encoded.extend(to_binary64(sigbits)[-6:]) | |
encoded.extend(to_binary64(xor >> tz)[-sigbits:]) | |
prev = v | |
return encoded.tobytes() | |
_ARRAY_3_ZERO = bitarray.bitarray('000') | |
_ARRAY_2_ZERO = bitarray.bitarray('00') | |
def decode_xor_floats(encoded, number_of_values): | |
if number_of_values == 0: | |
return [] | |
array = bitarray.bitarray() | |
array.frombytes(encoded.tobytes()) | |
results = [] | |
# Read first value | |
results.append(struct.unpack("!d", array[:64].tobytes())[0]) | |
index = 64 | |
for _ in range(number_of_values - 1): | |
index += 1 | |
if array[index - 1]: | |
index += 1 | |
if array[index - 1]: | |
bits = _ARRAY_3_ZERO + array[index:index + 5] | |
leading = struct.unpack("!B", bits.tobytes())[0] | |
bits = _ARRAY_2_ZERO + array[index + 5:index + 11] | |
trailing = 64 - leading - struct.unpack( | |
"!B", bits.tobytes())[0] | |
index += 11 | |
sigbits = 64 - leading - trailing | |
bits = bitarray.bitarray( | |
'0' * (64 - sigbits)) + array[index:index + sigbits] | |
num = struct.unpack("!Q", bits.tobytes())[0] | |
index += sigbits | |
vbits = double_to_int(results[-1]) | |
vbits ^= num << trailing | |
results.append(int_to_double(vbits)) | |
else: | |
results.append(results[-1]) | |
return results |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello!
I am playing around with different compression schemes and happened upon your Python implementation of the Gorilla float compression. Nice that you made one :-)
I think I found a minor bug:
If the number of significant bits is 64 (the maximum), in line 82 it will be encoded as '000000' ('1000000' = 64, and then only the last 6 bits).
In line 114, where you decode the number of significant bits, this will then be read as 0 (zero) significant bits, so the decoding will be faulty.
This happened to me during my testing, but not on all files 64 significant bits are encountered, so it can be hard to spot.
During decoding, one should use
sigbits = 64 if sigbits == 0 else sigbits
e.g. by inserting it in line 117 (or maybe there is a more elegant way...).
Cheers!