akhenakh · October 4, 2025 04:06
diff --git a/zim.yaml b/zim.yaml
 # Format-level metadata: identity, cross-references and parsing defaults.
 meta:
  id: zim
  title: ZIM file format
  license: CC-BY-SA-3.0
  # Defaults applied to every integer and string field declared below.
  endian: le
  encoding: UTF-8
  file-extension: [zim, zimaa]
  xref: {justsolve: ZIM, wikidata: Q2474324}
 # Human-readable summary; the specification link lives in `doc-ref` so that
 # documentation generators can render it as a reference.
 doc: |
  The ZIM file format is a format for offline content, like Wikipedia.
  It is an open standard, and is based on a series of compressed "clusters"
  that contain the content.
 doc-ref: 'https://openzim.org/wiki/ZIM_file_format'

 # The header is the only sequentially parsed field; every other structure is
 # reached through the absolute offsets it stores (see `instances`).
 seq:
  - id: header
    type: header

 # Structures located through the absolute offsets stored in the header.
 instances:
  mime_list:
    doc: The MIME type list, which contains a series of null-terminated strings.
    type: mime_list
    pos: header.mime_list_pos
  path_pointers:
    doc: A list of 8-byte offsets to directory entries, ordered by path.
    type: u8
    pos: header.path_ptr_pos
    repeat: expr
    repeat-expr: header.entry_count
  title_pointers:
    doc: A list of 4-byte indices into the path_pointers list, ordered by title.
    type: u4
    pos: header.title_ptr_pos
    repeat: expr
    repeat-expr: header.entry_count
  cluster_pointers:
    doc: A list of 8-byte offsets to the data clusters.
    type: u8
    pos: header.cluster_ptr_pos
    repeat: expr
    repeat-expr: header.cluster_count

  # Pattern for reading items from a list of offsets: each wrapper receives
  # its own index and exposes the real structure as an instance. The wrapper
  # types declare no `seq` fields of their own.
  directory_entry_wrappers:
    doc: An array of wrappers, each pointing to a lazily-loaded directory entry.
    type: directory_entry_wrapper(_index)
    repeat: expr
    repeat-expr: header.entry_count

  cluster_wrappers:
    doc: An array of wrappers, each pointing to a lazily-loaded cluster.
    type: cluster_wrapper(_index)
    repeat: expr
    repeat-expr: header.cluster_count

  checksum:
    doc: The MD5 checksum of the ZIM file, with this checksum field zeroed.
    size: 16
    pos: header.checksum_pos

  entry_by_title:
    doc: >
      An array of helpers providing access to directory entries
      in title order. To access an actual entry, use
      `entry_by_title[i].entry`.
    type: entry_by_title_helper(_index)
    repeat: expr
    repeat-expr: header.entry_count

 types:
  # Fixed-layout file header: magic, version, counts, and the absolute offsets
  # that the root-level instances seek to.
  header:
    seq:
      # ASCII "ZIM" followed by the byte 0x04.
      - id: magic_number
        contents: [0x5a, 0x49, 0x4d, 0x04]
      # Read by cluster_data.is_extended (extended offsets need major >= 6).
      - id: major_version
        type: u2
      - id: minor_version
        type: u2
      - id: uuid
        size: 16
      # Element count of path_pointers, title_pointers and the entry wrappers.
      - id: entry_count
        type: u4
      # Element count of cluster_pointers and cluster_wrappers.
      - id: cluster_count
        type: u4
      # Absolute file offsets used as `pos` by the root instances.
      - id: path_ptr_pos
        type: u8
      - id: title_ptr_pos
        type: u8
      - id: cluster_ptr_pos
        type: u8
      - id: mime_list_pos
        type: u8
      # NOTE(review): presumably directory-entry indices of the main and
      # layout pages -- not used anywhere in this spec; confirm against the
      # format documentation.
      - id: main_page
        type: u4
      - id: layout_page
        type: u4
      # Offset of the 16-byte checksum; also used as the end of the last
      # cluster by cluster_wrapper.
      - id: checksum_pos
        type: u8

  # Sequence of NUL-terminated strings; reading stops after the first empty
  # string, which is itself kept as the last element of `items`.
  mime_list:
    seq:
      - id: items
        type: strz
        repeat: until
        repeat-until: '_ == ""'

  # Zero-byte helper: given an index into the path pointer list, exposes the
  # directory entry stored at that offset as a lazily evaluated instance.
  directory_entry_wrapper:
    params:
      - id: idx
        type: u4
    instances:
      entry:
        type: directory_entry
        pos: _root.path_pointers[idx]

  # One directory entry. The value of `mime_type_idx` selects the layout:
  #   0xffff          redirect entry   -> `body_redirect` follows `revision`
  #   0xfffe, 0xfffd  linktarget / deleted entry (deprecated) -> no body at
  #                   all; `path` follows `revision` directly
  #   anything else   content entry    -> `body_content` follows `revision`
  directory_entry:
    seq:
      - id: mime_type_idx
        type: u2
      - id: parameter_len
        type: u1
        valid: 0
      - id: namespace
        type: str
        size: 1
      - id: revision
        type: u4
        valid: 0
      - id: body_redirect
        type: redirect_body
        if: is_redirect
      # Only genuine content entries carry cluster/blob numbers. Reading them
      # for linktarget/deleted entries would swallow the first 8 bytes of
      # `path`.
      - id: body_content
        type: content_body
        if: has_content
      - id: path
        type: strz
      - id: title
        type: strz
      - id: parameter
        size: parameter_len
    instances:
      is_redirect:
        value: mime_type_idx == 0xffff
      # mime_type_idx is a u2, so "< 0xfffd" excludes exactly the three
      # reserved markers 0xfffd, 0xfffe and 0xffff.
      has_content:
        value: mime_type_idx < 0xfffd
      mime_type:
        value: _root.mime_list.items[mime_type_idx]
        if: has_content
      # Raw bytes of the blob this entry points at, resolved through the
      # cluster wrapper and the cluster's blob table.
      content:
        value: _root.cluster_wrappers[body_content.cluster_number].cluster.data.blobs[body_content.blob_number].body
        if: has_content

  # Body of a redirect entry (parsed when mime_type_idx == 0xffff).
  redirect_body:
    seq:
      # NOTE(review): presumably the index of the target directory entry --
      # nothing in this spec dereferences it; confirm against the format docs.
      - id: redirect_index
        type: u4

  # Body of a content entry: locates the entry's data as (cluster, blob).
  content_body:
    seq:
      # Index into _root.cluster_wrappers (see directory_entry.content).
      - id: cluster_number
        type: u4
      # Index into that cluster's `blobs` array.
      - id: blob_number
        type: u4

  # Zero-byte helper exposing the cluster at index `idx` as a lazily
  # evaluated, size-limited substream.
  cluster_wrapper:
    params:
      - id: idx
        type: u4
    instances:
      cluster:
        type: cluster
        pos: _root.cluster_pointers[idx]
        # End offset is the next cluster's start, or the checksum position
        # for the last cluster; the size is that end minus this cluster's
        # start.
        # NOTE(review): this assumes cluster pointers are in ascending file
        # order and that the last cluster runs up to the checksum -- confirm.
        size: >-
          (idx + 1 < _root.header.cluster_count
          ? _root.cluster_pointers[idx + 1]
          : _root.header.checksum_pos) - _root.cluster_pointers[idx]

  # A data cluster: one info byte followed by the (possibly compressed)
  # offset table and blob data. The low nibble of `info` is the compression
  # scheme:
  #   0, 1  stored uncompressed (0 = "default", 1 = "none")
  #   4     XZ / LZMA2
  #   5     Zstandard
  # Exactly one payload_* field is parsed, and `data` returns its cluster_data.
  cluster:
    seq:
      - id: info
        type: u1
      - id: payload_uncompressed
        type: cluster_data(info)
        size-eos: true
        if: is_uncompressed
      - id: payload_xz
        type: processed_cluster_xz(info)
        size-eos: true
        if: compression == 4
      - id: payload_zstd
        type: processed_cluster_zstd(info)
        size-eos: true
        if: compression == 5
    instances:
      compression:
        value: info & 0xf
      # Both 0 and 1 mean "no compression"; handling only 1 left clusters
      # written with the default value without any parsed payload.
      is_uncompressed:
        value: compression == 0 or compression == 1
      # Any value other than 0, 1 or 4 falls through to the zstd payload,
      # which is only present when compression == 5.
      data:
        value: 'is_uncompressed ? payload_uncompressed : (compression == 4 ? payload_xz.decompressed : payload_zstd.decompressed)'

  # Runs the rest of the cluster through the `xz` processor and parses the
  # result as cluster_data, forwarding the cluster's info byte.
  # NOTE(review): `xz` is not one of Kaitai's built-in process routines, so
  # presumably a custom processor must be supplied by the consumer -- confirm.
  processed_cluster_xz:
    params:
      - id: info
        type: u1
    seq:
      - id: decompressed
        type: cluster_data(info)
        size-eos: true
        process: xz

  # Runs the rest of the cluster through the `zstd` processor and parses the
  # result as cluster_data, forwarding the cluster's info byte.
  # NOTE(review): `zstd` is not one of Kaitai's built-in process routines, so
  # presumably a custom processor must be supplied by the consumer -- confirm.
  processed_cluster_zstd:
    params:
      - id: info
        type: u1
    seq:
      - id: decompressed
        type: cluster_data(info)
        size-eos: true
        process: zstd

  # Decompressed cluster content: an offset table followed by the blob bytes.
  # Offsets are 4 bytes wide, or 8 bytes in "extended" clusters. The first
  # offset points just past the table, so (first offset / offset width) is
  # the number of offsets. The final offset marks the end of the last blob,
  # so the table always holds one more offset than there are blobs.
  cluster_data:
    params:
      - id: cluster_info
        type: u1
    seq:
      - id: first_offset4
        type: u4
        if: not is_extended
      - id: first_offset8
        type: u8
        if: is_extended
      - id: rest_offsets4
        type: u4
        repeat: expr
        repeat-expr: (first_offset4 / 4) - 1
        if: not is_extended
      - id: rest_offsets8
        type: u8
        repeat: expr
        repeat-expr: (first_offset8 / 8) - 1
        if: is_extended
    instances:
      # Bit 4 of the info byte selects 8-byte offsets (major version >= 6).
      is_extended:
        value: _root.header.major_version >= 6 and (cluster_info & 0b10000) != 0
      # Total number of entries in the offset table (first + rest).
      num_offsets:
        value: 'is_extended ? (first_offset8 / 8) : (first_offset4 / 4)'
      # One fewer than the number of offsets. Using the offset count here
      # made the last blob read rest_offsets*[idx] past the end of the table.
      num_blobs:
        value: num_offsets - 1
      blobs:
        type: blob(_index)
        repeat: expr
        repeat-expr: num_blobs

  # Zero-byte helper for blob number `idx` of the enclosing cluster_data.
  # Offset k of the table is first_offset* for k == 0 and rest_offsets*[k - 1]
  # otherwise; the blob spans offset[idx] .. offset[idx + 1].
  blob:
    params:
      - id: idx
        type: u4
    instances:
      pos:
        value: >-
          _parent.is_extended
          ? (idx == 0 ? _parent.first_offset8 : _parent.rest_offsets8[idx - 1])
          : (idx == 0 ? _parent.first_offset4 : _parent.rest_offsets4[idx - 1])
      end_pos:
        value: >-
          _parent.is_extended
          ? _parent.rest_offsets8[idx]
          : _parent.rest_offsets4[idx]
      size:
        value: end_pos - pos
      # Raw blob bytes, read from the parent's stream at `pos`.
      body:
        size: size
        pos: pos
        io: _parent._io

  # Zero-byte helper for title-ordered access: position `idx` in the title
  # pointer list yields an index into the path-ordered entry wrappers.
  entry_by_title_helper:
    params:
      - id: idx
        type: u4
    instances:
      # Two-step lookup: title_pointers[idx] -> directory_entry_wrappers[...].
      entry:
        value: _root.directory_entry_wrappers[_root.title_pointers[idx]].entry
	# NOTE(review): from here on the spec is repeated with every line at a
	# single tab, so the nesting of keys is no longer expressed. Tabs are not
	# valid YAML indentation, and `\|` below is an escaped `|` block-scalar
	# indicator. Top-level metadata, description and the single `seq` field.
	meta:
	id: zim
	title: ZIM file format
	file-extension:
	- zim
	- zimaa
	xref:
	justsolve: ZIM
	wikidata: Q2474324
	license: CC-BY-SA-3.0
	endian: le
	encoding: UTF-8
	doc: \|
	The ZIM file format is a format for offline content, like Wikipedia.
	It is an open standard, and is based on a series of compressed "clusters"
	that contain the content.
	See https://openzim.org/wiki/ZIM_file_format

	seq:
	- id: header
	type: header

	# Root instances (flattened copy): pointer tables and the MIME list are
	# read at offsets taken from the header; the *_wrappers / entry_by_title
	# arrays pass `_index` to helper types.
	instances:
	mime_list:
	pos: header.mime_list_pos
	type: mime_list
	doc: The MIME type list, which contains a series of null-terminated strings.
	path_pointers:
	pos: header.path_ptr_pos
	type: u8
	repeat: expr
	repeat-expr: header.entry_count
	doc: A list of 8-byte offsets to directory entries, ordered by path.
	title_pointers:
	pos: header.title_ptr_pos
	type: u4
	repeat: expr
	repeat-expr: header.entry_count
	doc: A list of 4-byte indices into the path_pointers list, ordered by title.
	cluster_pointers:
	pos: header.cluster_ptr_pos
	type: u8
	repeat: expr
	repeat-expr: header.cluster_count
	doc: A list of 8-byte offsets to the data clusters.

	# pattern for reading items from a list of offsets.
	directory_entry_wrappers:
	type: directory_entry_wrapper(_index)
	repeat: expr
	repeat-expr: header.entry_count
	doc: An array of wrappers, each pointing to a lazily-loaded directory entry.

	cluster_wrappers:
	type: cluster_wrapper(_index)
	repeat: expr
	repeat-expr: header.cluster_count
	doc: An array of wrappers, each pointing to a lazily-loaded cluster.

	checksum:
	pos: header.checksum_pos
	size: 16
	doc: The MD5 checksum of the ZIM file, with this checksum field zeroed.

	entry_by_title:
	type: entry_by_title_helper(_index)
	repeat: expr
	repeat-expr: header.entry_count
	doc: >
	An array of helpers providing access to directory entries in title order.
	To access an actual entry, use `entry_by_title[i].entry`.

	types:
	# File header (flattened copy): magic bytes "ZIM" + 0x04, version, UUID,
	# entry/cluster counts and the absolute offsets used by the root instances.
	header:
	seq:
	- id: magic_number
	contents: [0x5a, 0x49, 0x4d, 0x04]
	- id: major_version
	type: u2
	- id: minor_version
	type: u2
	- id: uuid
	size: 16
	- id: entry_count
	type: u4
	- id: cluster_count
	type: u4
	- id: path_ptr_pos
	type: u8
	- id: title_ptr_pos
	type: u8
	- id: cluster_ptr_pos
	type: u8
	- id: mime_list_pos
	type: u8
	- id: main_page
	type: u4
	- id: layout_page
	type: u4
	- id: checksum_pos
	type: u8

	# MIME list (flattened copy): NUL-terminated strings, read until an empty
	# string is encountered.
	mime_list:
	seq:
	- id: items
	type: strz
	repeat: until
	repeat-until: _.length == 0

	# Helper (flattened copy): exposes the directory entry located at
	# path_pointers[idx].
	directory_entry_wrapper:
	params:
	- id: idx
	type: u4
	instances:
	entry:
	pos: _root.path_pointers[idx]
	type: directory_entry

	# Directory entry (flattened copy).
	# NOTE(review): `body_content` and `content` are gated only on
	# mime_type_idx != 0xffff, while `mime_type` also excludes 0xfffe and
	# 0xfffd. Presumably those two markers denote entries without
	# cluster/blob numbers, in which case 8 bytes of `path` are consumed by
	# mistake -- confirm against the format documentation.
	directory_entry:
	seq:
	- id: mime_type_idx
	type: u2
	- id: parameter_len
	type: u1
	valid: 0
	- id: namespace
	type: str
	size: 1
	- id: revision
	type: u4
	valid: 0
	- id: body_redirect
	type: redirect_body
	if: mime_type_idx == 0xffff
	- id: body_content
	type: content_body
	if: mime_type_idx != 0xffff
	- id: path
	type: strz
	- id: title
	type: strz
	- id: parameter
	size: parameter_len
	instances:
	mime_type:
	value: _root.mime_list.items[mime_type_idx]
	if: mime_type_idx != 0xffff and mime_type_idx != 0xfffe and mime_type_idx != 0xfffd
	content:
	value: _root.cluster_wrappers[body_content.cluster_number].cluster.data.blobs[body_content.blob_number].body
	if: mime_type_idx != 0xffff

	# Redirect body (flattened copy): a single u4, parsed when
	# mime_type_idx == 0xffff.
	redirect_body:
	seq:
	- id: redirect_index
	type: u4

	# Content body (flattened copy): cluster and blob numbers used by
	# directory_entry.content to locate the entry's data.
	content_body:
	seq:
	- id: cluster_number
	type: u4
	- id: blob_number
	type: u4

	# Helper (flattened copy): exposes cluster `idx`, sized as the distance to
	# the next cluster pointer, or to checksum_pos for the last cluster.
	cluster_wrapper:
	params:
	- id: idx
	type: u4
	instances:
	cluster:
	pos: _root.cluster_pointers[idx]
	size: >
	(idx < (_root.header.cluster_count - 1)) ?
	(_root.cluster_pointers[idx + 1] - _root.cluster_pointers[idx]) :
	(_root.header.checksum_pos - _root.cluster_pointers[idx])
	type: cluster

	# Cluster (flattened copy): info byte, then one payload chosen by the low
	# nibble of `info` (1 = uncompressed, 4 = xz, 5 = zstd).
	# NOTE(review): a nibble of 0 parses no payload, and `data` then falls
	# through to payload_zstd. Presumably 0 should also be treated as
	# uncompressed -- confirm against the format documentation.
	cluster:
	seq:
	- id: info
	type: u1
	- id: payload_uncompressed
	type: cluster_data(info)
	size-eos: true
	if: compression == 1
	- id: payload_xz
	type: processed_cluster_xz(info)
	size-eos: true
	if: compression == 4
	- id: payload_zstd
	type: processed_cluster_zstd(info)
	size-eos: true
	if: compression == 5
	instances:
	compression:
	value: info & 0xf
	data:
	value: 'compression == 1 ? payload_uncompressed : (compression == 4 ? payload_xz.decompressed : payload_zstd.decompressed)'

	# XZ payload (flattened copy): applies the `xz` process to the remaining
	# bytes and parses the result as cluster_data.
	processed_cluster_xz:
	params:
	- id: info
	type: u1
	seq:
	- id: decompressed
	process: xz
	size-eos: true
	type: cluster_data(info)

	# Zstandard payload (flattened copy): applies the `zstd` process to the
	# remaining bytes and parses the result as cluster_data.
	processed_cluster_zstd:
	params:
	- id: info
	type: u1
	seq:
	- id: decompressed
	process: zstd
	size-eos: true
	type: cluster_data(info)

	# Cluster data (flattened copy): offset table (4- or 8-byte entries)
	# followed by blob bytes.
	# NOTE(review): `num_blobs` equals the number of offsets (first + rest),
	# yet blob.end_pos reads rest_offsets*[idx], which only exists for
	# idx < offsets - 1. The blob count looks one too high -- confirm.
	cluster_data:
	params:
	- id: cluster_info
	type: u1
	seq:
	- id: first_offset4
	type: u4
	if: not is_extended
	- id: first_offset8
	type: u8
	if: is_extended
	- id: rest_offsets4
	type: u4
	repeat: expr
	repeat-expr: (first_offset4 / 4) - 1
	if: not is_extended
	- id: rest_offsets8
	type: u8
	repeat: expr
	repeat-expr: (first_offset8 / 8) - 1
	if: is_extended
	instances:
	is_extended:
	value: _root.header.major_version >= 6 and (cluster_info & 0b10000) != 0
	num_blobs:
	value: 'is_extended ? (first_offset8 / 8) : (first_offset4 / 4)'
	blobs:
	type: blob(_index)
	repeat: expr
	repeat-expr: num_blobs

	# Blob helper (flattened copy): start is offset[idx], end is
	# offset[idx + 1], and the body is read from the parent's stream.
	blob:
	params:
	- id: idx
	type: u4
	instances:
	pos:
	value: 'idx == 0 ? (_parent.is_extended ? _parent.first_offset8 : _parent.first_offset4) : (_parent.is_extended ? _parent.rest_offsets8[idx - 1] : _parent.rest_offsets4[idx - 1])'
	end_pos:
	value: '_parent.is_extended ? _parent.rest_offsets8[idx] : _parent.rest_offsets4[idx]'
	size:
	value: end_pos - pos
	body:
	io: _parent._io
	pos: pos
	size: size

	# Title-order helper (flattened copy): title_pointers[idx] selects the
	# directory entry wrapper whose entry is returned.
	entry_by_title_helper:
	params:
	- id: idx
	type: u4
	instances:
	entry:
	value: _root.directory_entry_wrappers[_root.title_pointers[idx]].entry
No results found