Last active
June 19, 2018 04:23
-
-
Save catid/1e8e993e1581e92d96515a07c5ce4804 to your computer and use it in GitHub Desktop.
Extra huf_compress heuristic to speed up already-compressed or random data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Scan input and build symbol stats */ | |
{ CHECK_V_F(largest, FSE_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->count) ); | |
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ | |
if (largest <= (srcSize >> 7) + 1) { | |
return 0; /* heuristic : probably not compressible enough */ | |
} | |
/* Added this more accurate heuristic to handle high entropy data properly -catid */ | |
U64 total = 0; | |
U64 sum_clogc = 0; | |
unsigned nonzero_count = 0; | |
for (unsigned i = 0; i < maxSymbolValue; ++i) { | |
U32 const c_i = table->count[i]; | |
if (c_i == 0) { | |
continue; | |
} | |
total += c_i; | |
++nonzero_count; | |
unsigned const c_bits = BIT_highbit32(c_i) + 1; | |
sum_clogc += c_i * c_bits; | |
} | |
unsigned t_bits = 0; | |
U32 t_bits_input = (U32)total; | |
if (total >= 0x100000000ULL) { | |
t_bits = 32; | |
t_bits_input = (U32)(total >> 32); | |
} | |
t_bits += BIT_highbit32(t_bits_input) + 1; | |
unsigned estimate_bytes = (unsigned)((total * t_bits - sum_clogc) / 8); | |
if (nonzero_count > 64) { | |
estimate_bytes += nonzero_count / 4; | |
} | |
else { | |
estimate_bytes += nonzero_count / 2; | |
} | |
if (estimate_bytes > srcSize) { | |
return 0; /* heuristic : probably not compressible enough */ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It goes in huf_compress.c around like 680 here: