Created
November 11, 2023 15:02
-
-
Save Geograph-us/6b82845a6112e690862cba09c00a285d to your computer and use it in GitHub Desktop.
ulong (UInt64) compression based on https://github.com/wolfgarbe/EliasFanoCompression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
byte[] EliasFanoCompress(ulong[] UlongsArray) | |
{ | |
long maxCompressedSize = (long)((2 + Math.Ceiling(Math.Log((double)UlongsArray[UlongsArray.Length - 1] / (double)UlongsArray.Length, 2))) * UlongsArray.Length / 8) + 6; | |
byte[] compressedBuffer = new byte[maxCompressedSize]; | |
ulong compressedBufferPointer2 = 0; | |
ulong compressedBufferPointer1 = 0; | |
UInt64 lastID = 0; | |
UInt64 buffer1 = 0; | |
UInt64 bufferLength1 = 0; | |
UInt64 buffer2 = 0; | |
UInt64 bufferLength2 = 0; | |
ulong largestblockID = (ulong)UlongsArray[UlongsArray.Length - 1]; | |
double averageDelta = (double)largestblockID / (double)UlongsArray.Length; | |
double averageDeltaLog = Math.Log(averageDelta, 2); | |
ulong lowBitsLength = (ulong)Math.Floor(averageDeltaLog); | |
if (lowBitsLength < 0) lowBitsLength = 0; | |
ulong lowBitsMask = (((ulong)1 << (int)lowBitsLength) - 1); | |
compressedBufferPointer2 = lowBitsLength * (ulong)UlongsArray.Length / 8 + 6; | |
compressedBuffer[compressedBufferPointer1++] = (byte)(UlongsArray.Length & 255); | |
compressedBuffer[compressedBufferPointer1++] = (byte)((UlongsArray.Length >> 8) & 255); | |
compressedBuffer[compressedBufferPointer1++] = (byte)((UlongsArray.Length >> 16) & 255); | |
compressedBuffer[compressedBufferPointer1++] = (byte)((UlongsArray.Length >> 24) & 255); | |
compressedBuffer[compressedBufferPointer1++] = (byte)lowBitsLength; | |
foreach (ulong docID in UlongsArray) | |
{ | |
ulong docIdDelta = (docID - lastID - 1); | |
buffer1 <<= (int)lowBitsLength; | |
buffer1 |= (docIdDelta & lowBitsMask); | |
bufferLength1 += lowBitsLength; | |
while (bufferLength1 > 7) | |
{ | |
bufferLength1 -= 8; | |
compressedBuffer[compressedBufferPointer1++] = (byte)(buffer1 >> (int)bufferLength1); | |
} | |
ulong unaryCodeLength = (ulong)(docIdDelta >> (int)lowBitsLength) + 1; | |
buffer2 <<= (int)unaryCodeLength; | |
buffer2 |= 1; | |
bufferLength2 += unaryCodeLength; | |
while (bufferLength2 > 7) | |
{ | |
bufferLength2 -= 8; | |
compressedBuffer[compressedBufferPointer2++] = (byte)(buffer2 >> (int)bufferLength2); | |
} | |
lastID = docID; | |
} | |
if (bufferLength1 > 0) compressedBuffer[compressedBufferPointer1++] = (byte)(buffer1 << (int)(8 - bufferLength1)); | |
if (bufferLength2 > 0) compressedBuffer[compressedBufferPointer2++] = (byte)(buffer2 << (int)(8 - bufferLength2)); | |
Array.Resize(ref compressedBuffer, (int)compressedBufferPointer2); | |
return compressedBuffer; | |
} | |
public static UInt64[,] decodingTableHighBits = new UInt64[256, 8]; | |
public static byte[] decodingTableNumber = new byte[256]; | |
public static byte[] decodingTableHighBitsCarryover = new byte[256]; | |
UInt64[] EliasFanoDecompress(byte[] compressedBuffer) | |
{ | |
if (decodingTableHighBitsCarryover[0] == 0) | |
{ | |
for (int i = 0; i < 256; i++) | |
{ | |
byte zeroCount = 0; | |
for (int j = 7; j >= 0; j--) | |
{ | |
if ((i & (1 << j)) > 0) | |
{ | |
decodingTableHighBits[i, decodingTableNumber[i]] = zeroCount; | |
decodingTableNumber[i]++; | |
zeroCount = 0; | |
} | |
else zeroCount++; | |
} | |
decodingTableHighBitsCarryover[i] = zeroCount; | |
} | |
} | |
UInt64 compressedBufferPointer = (ulong)compressedBuffer.LongLength; | |
UInt64 resultPointer = 0; | |
ulong lowBitsPointer = 0; | |
ulong lastDocID = 0; | |
ulong docID = 0; | |
UInt64 postingListCount = compressedBuffer[lowBitsPointer++]; | |
postingListCount |= (UInt64)compressedBuffer[lowBitsPointer++] << 8; | |
postingListCount |= (UInt64)compressedBuffer[lowBitsPointer++] << 16; | |
postingListCount |= (UInt64)compressedBuffer[lowBitsPointer++] << 24; | |
ulong[] decompressedUlongs = new ulong[postingListCount]; | |
byte lowBitsLength = compressedBuffer[lowBitsPointer++]; | |
byte lowBitsCount = 0; | |
byte lowBits = 0; | |
byte cb = 1; | |
for (UInt64 highBitsPointer = lowBitsLength * postingListCount / 8 + 6; highBitsPointer < compressedBufferPointer; highBitsPointer++) | |
{ | |
docID += decodingTableHighBitsCarryover[cb]; | |
cb = compressedBuffer[highBitsPointer]; | |
UInt64 docIdNumber = decodingTableNumber[cb]; | |
for (UInt64 i = 0; i < docIdNumber; i++) | |
{ | |
docID <<= lowBitsCount; | |
docID |= lowBits & ((1u << lowBitsCount) - 1u); | |
while (lowBitsCount < lowBitsLength) | |
{ | |
docID <<= 8; | |
lowBits = compressedBuffer[lowBitsPointer++]; | |
docID |= lowBits; | |
lowBitsCount += 8; | |
} | |
lowBitsCount -= lowBitsLength; | |
docID >>= lowBitsCount; | |
docID += (decodingTableHighBits[cb, i] << lowBitsLength) + lastDocID + 1u; | |
decompressedUlongs[resultPointer++] = docID; | |
lastDocID = docID; | |
docID = 0; | |
} | |
} | |
return decompressedUlongs; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment