Created
April 3, 2024 05:37
-
-
Save graeme-winter/e40534f137189b777700b2006d5eeab0 to your computer and use it in GitHub Desktop.
Manually unpack bitshuffle / lz4 data in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Proof of work: compute the SHA-1 hash of the decompressed, unshuffled data
# read directly from the file, and compare it with the SHA-1 hash of the same
# data as extracted by h5py.
import binascii | |
import hashlib | |
import struct | |
import sys | |
import lz4.block | |
import bitshuffle | |
import hdf5plugin | |
import numpy | |
import h5py | |
import tqdm | |
# Both tables are keyed by a chunk's offset as reported by h5py:
#   chunks: offset -> (byte offset of the raw chunk in the file, stored size)
#   hashes: offset -> hex SHA-1 of the chunk's data as decoded by h5py
chunks, hashes = {}, {}
def bits(data, block_size=8192, dtype=numpy.uint16):
    """Decompress and unshuffle a run of length-prefixed LZ4 blocks.

    ``data`` is walked front to back as a sequence of records, each made of
    a 4-byte big-endian unsigned length followed by that many bytes of LZ4
    block-compressed payload. Every payload is decompressed, viewed as
    ``dtype`` elements, bit-unshuffled, and fed into one running SHA-1.

    Args:
        data: Bytes-like buffer holding the concatenated records, with any
            chunk-level header already removed by the caller.
        block_size: Upper bound, in bytes, handed to the LZ4 decompressor
            for a single block; also forwarded unchanged to
            ``bitshuffle.bitunshuffle`` as its block-size argument.
            Defaults to 8192, the value that was previously hard-coded.
        dtype: NumPy dtype used to interpret each decompressed block.
            Defaults to ``numpy.uint16``, the previously hard-coded type.

    Returns:
        A ``(count, digest)`` tuple: the number of blocks processed and the
        hexadecimal SHA-1 digest of all unshuffled bytes, in order.

    Raises:
        struct.error: If fewer than four bytes remain where a length prefix
            is expected.
    """
    # NOTE(review): everything in ``data`` is treated as length-prefixed
    # blocks; trailing bytes in any other form would be misparsed as a
    # length prefix -- confirm against the producer's format.
    total = len(data)
    offset = 0
    count = 0
    sha = hashlib.sha1()
    while offset < total:
        (compressed_size,) = struct.unpack(">I", data[offset : offset + 4])
        start = offset + 4
        end = start + compressed_size
        raw = lz4.block.decompress(data[start:end], uncompressed_size=block_size)
        elements = numpy.frombuffer(raw, dtype=dtype)
        # NOTE(review): the same byte count is passed to bitunshuffle as in
        # the original code; whether that argument is measured in bytes or
        # elements should be confirmed against the bitshuffle docs.
        sha.update(bitshuffle.bitunshuffle(elements, block_size).tobytes())
        offset = end
        count += 1
    return count, sha.hexdigest()
# Pass 1: walk every stored chunk of the "data" dataset through h5py,
# recording where its raw bytes live in the file and the SHA-1 of the data
# h5py decodes for it. The file is opened explicitly read-only so that the
# script never depends on h5py's default mode.
with h5py.File(sys.argv[1], "r") as f:
    dataset = f["data"]
    n_chunks = dataset.id.get_num_chunks()
    for index in tqdm.tqdm(range(n_chunks)):
        chunk_info = dataset.id.get_chunk_info(index)
        offset = chunk_info.chunk_offset
        chunks[offset] = (chunk_info.byte_offset, chunk_info.size)
        # Only the first two offset components select the data; the last two
        # axes are read in full.
        # NOTE(review): this presumes a 4-D dataset whose chunks each span
        # the whole of the last two axes -- confirm for other files.
        decoded = dataset[offset[0], offset[1], :, :]
        hashes[offset] = hashlib.sha1(decoded.tobytes()).hexdigest()
# Pass 2: re-read each chunk straight from the file, bypassing h5py, and
# check that manual decompression reproduces the hash recorded in pass 1.
# Failures are raised explicitly instead of using ``assert`` statements so
# that the verification still runs under ``python -O``.
with open(sys.argv[1], "rb") as f:
    for offset in tqdm.tqdm(sorted(chunks)):
        byte_offset, stored_size = chunks[offset]
        f.seek(byte_offset)
        # Each chunk starts with a 12-byte header, unpacked as a big-endian
        # uint64 followed by a big-endian uint32; the rest is the payload.
        header = f.read(12)
        payload = f.read(stored_size - 12)
        total_bytes, block_bytes = struct.unpack(">QI", header)
        # The header values are pinned to the ones this script was written
        # for; bits() is called with its default block size to match.
        if (total_bytes, block_bytes) != (1052672, 8192):
            raise AssertionError(
                f"chunk {offset}: unexpected header {(total_bytes, block_bytes)}"
            )
        n_blocks, digest = bits(payload)
        # Ceiling division: a final partial block still counts as one block,
        # while an exact multiple does not gain an extra one.
        expected_blocks = -(-total_bytes // block_bytes)
        if n_blocks != expected_blocks:
            raise AssertionError(
                f"chunk {offset}: {n_blocks} blocks, expected {expected_blocks}"
            )
        if digest != hashes[offset]:
            raise AssertionError(
                f"chunk {offset}: sha1 {digest} != h5py sha1 {hashes[offset]}"
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment