Created
October 4, 2025 04:06
-
-
Save akhenakh/ae19f0de9fa509df544d9daed18c604e to your computer and use it in GitHub Desktop.
A work-in-progress Kaitai Struct (kaitai.io) specification describing the ZIM file format: https://www.openzim.org/wiki/ZIM_file_format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spec-level metadata: identity of the format, cross-references, and the
# defaults (byte order, string encoding) inherited by every type in this file.
meta:
  id: zim
  title: ZIM file format
  file-extension: [zim, zimaa]
  xref:
    justsolve: ZIM
    wikidata: Q2474324
  license: CC-BY-SA-3.0
  # All multi-byte integers below are read little-endian unless overridden.
  endian: le
  # Default encoding for every `str` / `strz` field in this spec.
  encoding: UTF-8

# Human-readable description of the format as a whole.
doc: |
  The ZIM file format is a format for offline content, like Wikipedia.
  It is an open standard, and is based on a series of compressed "clusters"
  that contain the content.
  See https://openzim.org/wiki/ZIM_file_format
# Only the header is parsed sequentially from the start of the file; every
# other structure is located through offsets stored in the header and is
# therefore declared under `instances`.
seq:
  - id: header
    type: header
# Top-level lazily evaluated views of the file. Each one either seeks to an
# offset taken from the header or builds an index-parameterised helper array.
instances:
  # --- Tables located directly through header offsets -----------------------
  mime_list:
    doc: The MIME type list, which contains a series of null-terminated strings.
    pos: header.mime_list_pos
    type: mime_list

  path_pointers:
    doc: A list of 8-byte offsets to directory entries, ordered by path.
    pos: header.path_ptr_pos
    type: u8
    repeat: expr
    repeat-expr: header.entry_count

  title_pointers:
    doc: A list of 4-byte indices into the path_pointers list, ordered by title.
    pos: header.title_ptr_pos
    type: u4
    repeat: expr
    repeat-expr: header.entry_count

  cluster_pointers:
    doc: A list of 8-byte offsets to the data clusters.
    pos: header.cluster_ptr_pos
    type: u8
    repeat: expr
    repeat-expr: header.cluster_count

  # --- Wrapper arrays --------------------------------------------------------
  # pattern for reading items from a list of offsets.
  # Each wrapper receives its own array index (`_index`) as a parameter and
  # resolves the matching pointer itself. The wrapper types declare no `seq`
  # fields, so building these arrays does not read any bytes from the stream.
  directory_entry_wrappers:
    doc: An array of wrappers, each pointing to a lazily-loaded directory entry.
    type: directory_entry_wrapper(_index)
    repeat: expr
    repeat-expr: header.entry_count

  cluster_wrappers:
    doc: An array of wrappers, each pointing to a lazily-loaded cluster.
    type: cluster_wrapper(_index)
    repeat: expr
    repeat-expr: header.cluster_count

  # --- Trailer ---------------------------------------------------------------
  checksum:
    doc: The MD5 checksum of the ZIM file, with this checksum field zeroed.
    pos: header.checksum_pos
    size: 16

  # --- Title-ordered access --------------------------------------------------
  entry_by_title:
    doc: >
      An array of helpers providing access to directory entries in title order.
      To access an actual entry, use `entry_by_title[i].entry`.
    type: entry_by_title_helper(_index)
    repeat: expr
    repeat-expr: header.entry_count
types:
  # Fixed-layout file header: magic, format version, UUID, entry/cluster
  # counts and the absolute offsets of every other section of the file.
  header:
    seq:
      - id: magic_number
        # The bytes "ZIM" followed by 0x04.
        contents: [0x5a, 0x49, 0x4d, 0x04]
      - id: major_version
        type: u2
      - id: minor_version
        type: u2
      - id: uuid
        size: 16
      - id: entry_count
        type: u4
      - id: cluster_count
        type: u4
      - id: path_ptr_pos
        type: u8
      - id: title_ptr_pos
        type: u8
      - id: cluster_ptr_pos
        type: u8
      - id: mime_list_pos
        type: u8
      - id: main_page
        type: u4
      - id: layout_page
        type: u4
      - id: checksum_pos
        type: u8

  # Sequence of NUL-terminated strings. Reading stops at the first empty
  # string; `repeat: until` keeps that terminating empty string as the last
  # element of `items`.
  mime_list:
    seq:
      - id: items
        type: strz
        repeat: until
        repeat-until: _.length == 0

  # Zero-byte helper: turns an index into the path pointer table into the
  # directory entry stored at that offset. `entry` is only parsed on access.
  directory_entry_wrapper:
    params:
      - id: idx
        type: u4
    instances:
      entry:
        pos: _root.path_pointers[idx]
        type: directory_entry

  # One directory entry. `mime_type_idx` doubles as the entry-kind tag:
  #   0xffff          -> redirect entry (carries a redirect index)
  #   0xfffe / 0xfffd -> link-target / deleted entry (no body fields at all,
  #                      per the openZIM format description)
  #   anything else   -> content entry (carries cluster and blob numbers)
  directory_entry:
    seq:
      - id: mime_type_idx
        type: u2
      - id: parameter_len
        type: u1
        valid: 0
      - id: namespace
        type: str
        size: 1
      - id: revision
        type: u4
        valid: 0
      - id: body_redirect
        type: redirect_body
        if: mime_type_idx == 0xffff
      - id: body_content
        type: content_body
        # Fix: previously enabled for every non-redirect entry, which consumed
        # 8 bytes of the path for link-target/deleted entries.
        if: is_content_entry
      - id: path
        type: strz
      - id: title
        type: strz
      - id: parameter
        size: parameter_len
    instances:
      # True only for entries that reference a blob inside a cluster.
      is_content_entry:
        value: mime_type_idx != 0xffff and mime_type_idx != 0xfffe and mime_type_idx != 0xfffd
      # MIME type string looked up in the file-level MIME list.
      mime_type:
        value: _root.mime_list.items[mime_type_idx]
        if: is_content_entry
      # Raw bytes of the blob this entry points to (decompressing the cluster
      # on first access).
      content:
        value: _root.cluster_wrappers[body_content.cluster_number].cluster.data.blobs[body_content.blob_number].body
        # Fix: guarded by the same condition as `body_content`, so it is never
        # evaluated for entries that have no cluster/blob numbers.
        if: is_content_entry

  # Body of a redirect entry: index of the target directory entry.
  redirect_body:
    seq:
      - id: redirect_index
        type: u4

  # Body of a content entry: which cluster, and which blob inside it.
  content_body:
    seq:
      - id: cluster_number
        type: u4
      - id: blob_number
        type: u4

  # Zero-byte helper: turns a cluster index into a size-limited cluster.
  cluster_wrapper:
    params:
      - id: idx
        type: u4
    instances:
      cluster:
        pos: _root.cluster_pointers[idx]
        # The cluster length is not read from the cluster itself; it is derived
        # from the next cluster pointer, or from the checksum position for the
        # last cluster.
        # NOTE(review): assumes clusters are laid out in pointer order and that
        # the last one ends at the checksum - confirm against real files.
        size: >
          (idx < (_root.header.cluster_count - 1)) ?
          (_root.cluster_pointers[idx + 1] - _root.cluster_pointers[idx]) :
          (_root.header.checksum_pos - _root.cluster_pointers[idx])
        type: cluster

  # A cluster: one information byte followed by the (possibly compressed)
  # offset table and blob data. Exactly one `payload_*` field is present.
  cluster:
    seq:
      - id: info
        type: u1
      - id: payload_uncompressed
        type: cluster_data(info)
        size-eos: true
        # Fix: the ZIM format defines both 0 ("default") and 1 ("none") as
        # uncompressed; previously only 1 was accepted.
        if: compression == 0 or compression == 1
      - id: payload_xz
        type: processed_cluster_xz(info)
        size-eos: true
        if: compression == 4
      - id: payload_zstd
        type: processed_cluster_zstd(info)
        size-eos: true
        if: compression == 5
    instances:
      # Low 4 bits of the information byte select the compression scheme.
      compression:
        value: info & 0xf
      # Uniform access to the decoded cluster contents regardless of scheme.
      data:
        value: '(compression == 0 or compression == 1) ? payload_uncompressed : (compression == 4 ? payload_xz.decompressed : payload_zstd.decompressed)'

  # Decompression wrappers. The extra nesting level gives the decompressed
  # bytes their own stream, so blob offsets are relative to its start.
  # NOTE(review): `xz` and `zstd` are not among Kaitai Struct's built-in
  # processors (zlib, xor, rol, ror), so the compiler should treat them as
  # custom processing routines - confirm an implementation is supplied for
  # each target language.
  processed_cluster_xz:
    params:
      - id: info
        type: u1
    seq:
      - id: decompressed
        process: xz
        size-eos: true
        type: cluster_data(info)

  processed_cluster_zstd:
    params:
      - id: info
        type: u1
    seq:
      - id: decompressed
        process: zstd
        size-eos: true
        type: cluster_data(info)

  # Decoded cluster contents: a table of offsets followed by blob bytes.
  # The first offset points just past the table, so dividing it by the offset
  # width gives the number of offsets. The final offset marks the end of the
  # last blob, hence there is one more offset than there are blobs.
  cluster_data:
    params:
      - id: cluster_info
        type: u1
    seq:
      - id: first_offset4
        type: u4
        if: not is_extended
      - id: first_offset8
        type: u8
        if: is_extended
      - id: rest_offsets4
        type: u4
        repeat: expr
        repeat-expr: (first_offset4 / 4) - 1
        if: not is_extended
      - id: rest_offsets8
        type: u8
        repeat: expr
        repeat-expr: (first_offset8 / 8) - 1
        if: is_extended
    instances:
      # Bit 0x10 of the information byte selects 8-byte offsets; it is only
      # honoured for major version 6 and later.
      is_extended:
        value: _root.header.major_version >= 6 and (cluster_info & 0b10000) != 0
      # Total number of offsets in the table (first offset included).
      num_offsets:
        value: 'is_extended ? (first_offset8 / 8) : (first_offset4 / 4)'
      # Fix: previously equal to the offset count, which made the last blob's
      # `end_pos` index one past the end of `rest_offsets*`.
      num_blobs:
        value: num_offsets - 1
      # Zero-byte blob descriptors; the bytes are only read via `blob.body`.
      blobs:
        type: blob(_index)
        repeat: expr
        repeat-expr: num_blobs

  # Descriptor of a single blob inside a `cluster_data`. Offsets are taken
  # from the parent's table: blob `idx` spans offset[idx] .. offset[idx + 1],
  # where offset[0] is `first_offset*` and offset[k] is `rest_offsets*[k - 1]`.
  blob:
    params:
      - id: idx
        type: u4
    instances:
      pos:
        value: 'idx == 0 ? (_parent.is_extended ? _parent.first_offset8 : _parent.first_offset4) : (_parent.is_extended ? _parent.rest_offsets8[idx - 1] : _parent.rest_offsets4[idx - 1])'
      end_pos:
        value: '_parent.is_extended ? _parent.rest_offsets8[idx] : _parent.rest_offsets4[idx]'
      size:
        value: end_pos - pos
      # Raw blob bytes, read from the parent's (decoded) stream.
      body:
        io: _parent._io
        pos: pos
        size: size

  # Zero-byte helper: maps a position in title order to its directory entry
  # by going through the title pointer table.
  entry_by_title_helper:
    params:
      - id: idx
        type: u4
    instances:
      entry:
        value: _root.directory_entry_wrappers[_root.title_pointers[idx]].entry
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment