Skip to content

Instantly share code, notes, and snippets.

@mateon1
Last active July 14, 2021 12:30
Show Gist options
  • Save mateon1/4b9b81b4e18de030dfca25a7ca798946 to your computer and use it in GitHub Desktop.
Save mateon1/4b9b81b4e18de030dfca25a7ca798946 to your computer and use it in GitHub Desktop.
zstd block format with ugly workarounds
# Kaitai Struct description of a Zstandard stream.
meta:
  id: zstd
  title: Zstandard compression format
  file-extension: zst
  # Cross-references to external registries describing the same format.
  xref:
    justsolve: Zstandard
    mime: application/zstd
    rfc: 8478
    wikidata: Q105853477
  # Default byte order for multi-byte integers (u2/u4/u8) in every type below.
  endian: le
doc-ref: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
# A stream is a plain concatenation of frames, consumed until end of input.
seq:
  - id: frames
    type: any_frame
    repeat: eos
types:
zstd_frame:
  # A regular (non-skippable) frame: magic bytes, a bit-packed header,
  # optional header fields, a run of blocks and an optional checksum.
  meta:
    # Header bit fields are taken starting from the most significant bit.
    bit-endian: be
  seq:
    - id: magic
      contents: [0x28, 0xB5, 0x2F, 0xFD]
    # The next six fields add up to 8 bits: one header descriptor byte.
    - id: frame_content_size_flag
      type: b2
    - id: single_segment_flag
      type: b1
    - id: unused_1
      type: b1
    - id: reserved_1
      type: b1
      # Parsing is rejected when the reserved bit is set.
      valid: false
    - id: content_checksum_flag
      type: b1
    - id: dictionary_id_flag
      type: b2
    # Window descriptor byte (5 + 3 bits); absent for single-segment frames.
    - id: window_descriptor_exponent
      type: b5
      if: not single_segment_flag
    - id: window_descriptor_mantissa
      type: b3
      if: not single_segment_flag
    # Width of the dictionary ID is derived from dictionary_id_flag.
    - id: dictionary_id
      type:
        switch-on: did_field_size
        cases:
          1: u1
          2: u2
          4: u4
      if: did_field_size > 0
    # Width of the content size is derived from the two header flags.
    - id: frame_content_size
      type:
        switch-on: fcs_field_size
        cases:
          1: u1
          2: u2
          4: u4
          8: u8
      if: fcs_field_size > 0
    # Blocks are read up to and including the one flagged as last.
    - id: blocks
      type: block
      repeat: until
      repeat-until: _.last_block
    - id: content_checksum
      type: u4
      if: content_checksum_flag
  instances:
    # Bytes occupied by frame_content_size: 0, 1, 2, 4 or 8.
    fcs_field_size:
      value: 'frame_content_size_flag == 0 ? (single_segment_flag ? 1 : 0) : (1 << frame_content_size_flag)'
    # Content size with the +256 bias that applies to the 2-byte encoding.
    fcs_value:
      value: 'frame_content_size + (fcs_field_size == 2 ? 256 : 0)'
      if: fcs_field_size > 0
    # Bytes occupied by dictionary_id: 0, 1, 2 or 4.
    did_field_size:
      value: 'dictionary_id_flag == 0 ? 0 : (1 << (dictionary_id_flag - 1))'
    window_log:
      value: window_descriptor_exponent + 10
      if: not single_segment_flag
    # Disabled by the original author: the result needs 64 bits, while the
    # expressions are evaluated with 32-bit integers.
    # window_base:
    #   value: 1 << window_log
    # window_add:
    #   value: (window_base / 8) * window_descriptor_mantissa
    # window_size:
    #   value: window_base + window_add
block:
  # One data block: a 24-bit header (1 + 2 + 21 bits) followed by its payload.
  meta:
    # Header bit fields are taken starting from the least significant bit.
    bit-endian: le
  seq:
    - id: last_block
      type: b1
    - id: block_type
      type: b2
      enum: block_type
      # The `reserved` block type is rejected.
      valid:
        expr: _ != block_type::reserved
    - id: block_size
      type: b21
    # An RLE block stores a single byte regardless of block_size; every other
    # type stores block_size bytes. Only compressed payloads are parsed
    # further; other types stay as raw bytes.
    - id: data
      size: 'block_type != block_type::rle ? block_size : 1'
      type:
        switch-on: block_type
        cases:
          'block_type::compressed': compressed_block
compressed_block:
  # Payload of a block whose type is `compressed`: a literals section
  # (bit-packed header + literals_data) followed by a sequences section
  # (sequence count, compression modes, then the undecoded sequence data).
  meta:
    bit-endian: le
  seq:
    # The literals header is split over helper types because its bit widths
    # depend on values read earlier in the same header.
    - id: literals_alignment_hack
      type: literals_alignment_hack
    - id: regen_alignment_hack
      type: regen_alignment_hack(regenerated_size_bits)
    # Only compressed/treeless literals carry a compressed size.
    - id: compressed_size
      type:
        switch-on: compressed_size_bits
        cases:
          10: b10
          14: b14
          18: b18
      if: always_long_format
    # this should always be a byte boundary
    - id: literals_data
      size: literals_size
    # First byte of the variable-length (1 to 3 bytes) sequence count.
    - id: seq_header_byte0
      type: u1
    - id: num_sequences_remainder
      type:
        switch-on: num_sequences_len
        cases:
          1: b0
          2: u1
          3: u2
    # Compression-modes byte (2 + 2 + 2 + 2 bits). A block that carries no
    # sequences ends right after the sequence count, so this byte is only
    # present when num_sequences is non-zero; reading it unconditionally
    # would run past the end of the block.
    - id: reserved
      type: b2
      valid: 0
      if: num_sequences != 0
    - id: match_lengths_mode
      type: b2
      enum: compression_mode
      if: num_sequences != 0
    - id: offsets_mode
      type: b2
      enum: compression_mode
      if: num_sequences != 0
    - id: literals_lengths_mode
      type: b2
      enum: compression_mode
      if: num_sequences != 0
    # Whatever remains in the block (empty when there are no sequences).
    - id: sequences_data
      size-eos: true
  instances:
    literals_block_type:
      value: literals_alignment_hack.literals_block_type
    always_long_format:
      value: literals_alignment_hack.always_long_format
    # Two-bit size format. The high bit is only present in the stream when
    # always_long_format or the low bit is set; it is treated as 0 otherwise
    # instead of dereferencing a field that was never parsed.
    size_format:
      value: >-
        (always_long_format or literals_alignment_hack.size_format_lo) ?
        ((literals_alignment_hack.size_format_hi.to_i << 1)
        | literals_alignment_hack.size_format_lo.to_i) :
        0
    # Bit width of regenerated_size, selected by literals type and size format.
    regenerated_size_bits:
      value: >-
        always_long_format ?
        (size_format == 3 ? 18 : size_format == 2 ? 14 : 10) :
        (size_format == 0 ? 5 : size_format == 1 ? 12 : 20)
    # Bit width of compressed_size (same widths as the long-format
    # regenerated_size).
    compressed_size_bits:
      value: >-
        size_format == 3 ? 18 :
        size_format == 2 ? 14 : 10
      if: always_long_format
    regenerated_size:
      value: regen_alignment_hack.regenerated_size
    # Number of bytes stored in literals_data: the regenerated size for raw
    # literals, one byte for RLE, the compressed size otherwise.
    literals_size:
      value: >-
        literals_block_type == literals_block_type::raw ?
        regenerated_size :
        literals_block_type == literals_block_type::rle ?
        1 :
        compressed_size
    # Total length in bytes of the sequence-count field.
    num_sequences_len:
      value: 'seq_header_byte0 < 128 ? 1 : seq_header_byte0 < 255 ? 2 : 3'
    num_sequences:
      value: >-
        num_sequences_len == 1 ?
        seq_header_byte0 :
        num_sequences_len == 2 ?
        ((seq_header_byte0-128) << 8) | num_sequences_remainder :
        num_sequences_remainder + 0x7F00
literals_alignment_hack:
  # Leading bits of the literals header: the 2-bit literals block type and a
  # size-format selector of one or two bits.
  meta:
    bit-endian: le
  seq:
    - id: literals_block_type
      type: b2
      enum: literals_block_type
    - id: size_format_lo
      type: b1
    # The second selector bit is always present for compressed/treeless
    # literals; for raw/RLE literals it is present only when the first
    # selector bit is set.
    - id: size_format_hi
      type: b1
      if: size_format_lo or always_long_format
  instances:
    # True for the `compressed` and `treeless` literals types, i.e. for
    # everything that is neither `raw` nor `rle`.
    always_long_format:
      value: >-
        literals_block_type != literals_block_type::raw and
        literals_block_type != literals_block_type::rle
regen_alignment_hack:
  # Reads regenerated_size as a bit field whose width is supplied by the
  # caller, since the width is only known after earlier header bits are read.
  params:
    # Requested width in bits; only the widths listed in `cases` are handled.
    - id: regenerated_size_bits
      type: u1
  meta:
    bit-endian: le
  seq:
    - id: regenerated_size
      type:
        switch-on: regenerated_size_bits
        cases:
          5: b5
          10: b10
          12: b12
          14: b14
          18: b18
          20: b20
skippable_frame:
  # A frame that carries opaque user data: four magic bytes, a 32-bit length
  # and that many bytes of payload.
  seq:
    # First magic byte: accepted when its high nibble is 5 (0x50..0x5F).
    - id: magic1
      type: u1
      valid:
        expr: '(_ & 0xF0) == 0x50'
    # Remaining three magic bytes are fixed.
    - id: magic2
      contents: [0x2A, 0x4D, 0x18]
    - id: frame_size
      type: u4
    - id: user_data
      size: frame_size
any_frame:
  # Dispatcher that selects the frame type from the first byte of its magic.
  seq:
    - id: frame
      type:
        switch-on: first_byte
        cases:
          0x28: zstd_frame
          _: skippable_frame
  instances:
    # Byte at the current stream position, used only to choose the frame
    # type. Accepted values: 0x28 or anything in 0x50..0x5F.
    first_byte:
      pos: _io.pos
      type: u1
      valid:
        expr: '(_ == 0x28) or ((_ | 0xF) == 0x5F)'
enums:
  # Values of the 2-bit block_type field in a block header.
  block_type:
    0: raw
    1: rle
    2: compressed
    3: reserved
  # Values of the 2-bit literals_block_type field in the literals header.
  literals_block_type:
    0: raw
    1: rle
    2: compressed
    3: treeless
  # Values of the three 2-bit *_mode fields in a compressed block.
  compression_mode:
    0: predefined
    1: rle
    2: fse
    3: repeat
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment