@matomatical
Last active January 3, 2025 10:21
"""
Extract the contents of a `run-{id}.wandb` database file.
These database files are stored in a custom binary format. Namely, the database
is a sequence of wandb 'records' (each representing a logging event of some
type, including compute stats, program outputs, experimental metrics, wandb
telemetry, and various other things). Within these records, some data values
are encoded with json. Each record is encoded with protobuf and stored over one
or more blocks in a LevelDB log. The result is the binary .wandb database file.
Note that the wandb sdk internally contains all the code necessary to read
these files. It parses the LevelDB chunked records in the process of syncing to
the cloud, and will show you the results if you run the command
`wandb sync --view --verbose run-{id}.wandb`.
And the protobuf scheme is included with the sdk (for serialisation) so this
can also be used for deserialisation.
This is a simple script that leans on the wandb internals to invert the
encoding process and render the contents of the .wandb database as a pure
Python object. From here, it should be possible to, for example, aggregate all
of the metrics logged across a run and plot your own learning curves, without
having to go through the excruciating (and hereby redundant) process of
downloading this data from the cloud via the API.
Notes:

* The protobuf schema for the retrieved records is defined here:
  [https://github.com/wandb/wandb/blob/main/wandb/proto/wandb_internal.proto].
  This is useful for understanding the structure of the Python object returned
  by this tool.
* The LevelDB log format is documented here:
  [https://github.com/google/leveldb/blob/main/doc/log_format.md],
  but note that the W&B SDK does not depend on the LevelDB library, it just
  uses the same file structure. The reason for this seems to be a choice of
  checksum algorithm different from the LevelDB standard.
* The starting point for this tool was the discussion at this issue:
  https://github.com/wandb/wandb/issues/1768
  supplemented by my study of the wandb SDK source code.
* I've mostly been studying `wandb/wandb/sdk/internal/datastore.py` to
  understand the writer, but I recently noticed that if one uses wandb-core,
  it seems to use a different implementation.
Caveats:

* The technique depends on some internals of the wandb library that are not
  guaranteed to be preserved across versions. This script might therefore
  break at any time. Hopefully, before then, the wandb developers will provide
  an official way for users to access their own offline experimental data
  without roundtripping through their cloud platform.
* The script doesn't include error handling. It's plausible that it will break
  on corrupted or old databases. You'll have to deal with that when the time
  comes.
* OK, the time has come for me to deal with it, because a lot of my data is
  apparently corrupted (or was for some reason written without properly
  respecting the LevelDB format). I will try to patch the SDK so that parsing
  can recover from the next valid protobuf record.
"""
import collections
import dataclasses
import json

import google.protobuf.json_format as protobuf_json
import wandb


@dataclasses.dataclass(frozen=True)
class WeightsAndBiasesDatabase:
    history: tuple = ()
    summary: tuple = ()
    output: tuple = ()
    config: tuple = ()
    files: tuple = ()
    stats: tuple = ()
    artifact: tuple = ()
    tbrecord: tuple = ()
    alert: tuple = ()
    telemetry: tuple = ()
    metric: tuple = ()
    output_raw: tuple = ()
    run: tuple = ()
    exit: tuple = ()
    final: tuple = ()
    header: tuple = ()
    footer: tuple = ()
    preempting: tuple = ()
    noop_link_artifact: tuple = ()
    use_artifact: tuple = ()
    request: tuple = ()


def load_wandb_database(path: str) -> WeightsAndBiasesDatabase:
    """
    Parse the wandb database at `path` into lists of records, grouped by
    record type.
    """
    d = collections.defaultdict(list)
    # use wandb's internal reader class to parse the LevelDB log
    ds = wandb.sdk.internal.datastore.DataStore()
    # point the reader at the .wandb file
    ds.open_for_scan(path)
    # iteratively call scan_data(), which aggregates leveldb records into a
    # single protobuf record, or returns None at the end of the file
    while True:
        record_bin = ds.scan_data()
        if record_bin is None:
            break  # end of file
        # once we have the data, we need to parse it into a protobuf struct
        record_pb = wandb.proto.wandb_internal_pb2.Record()
        record_pb.ParseFromString(record_bin)
        # convert to a python dictionary
        record_dict = protobuf_json.MessageToDict(
            record_pb,
            preserving_proto_field_name=True,
        )
        # strip away the aux fields (num, control, etc.) and get to the data
        record_type = record_pb.WhichOneof("record_type")
        data_dict = record_dict[record_type]
        # replace any [{key: k, value_json: json(v)}] with {k: v}:
        for field, items in data_dict.items():
            if isinstance(items, list) and items and 'value_json' in items[0]:
                mapping = {}
                for item in items:
                    if 'key' in item:
                        key = item['key']
                    else:  # 'nested_key' in item
                        key = '/'.join(item['nested_key'])
                    assert key not in mapping
                    value = json.loads(item['value_json'])
                    mapping[key] = value
                data_dict[field] = mapping
        # append this record to the appropriate list for that record type
        d[record_type].append(data_dict)
    return WeightsAndBiasesDatabase(**d)


if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        print(
            "usage: parse_wandb_database.py path/to/run.wandb",
            file=sys.stderr,
        )
        sys.exit(1)
    path = sys.argv[1]
    print(f"loading wandb database from {path}...")
    wdb = load_wandb_database(path)
    print("loaded!")
    # report the number of records parsed for each record type
    for record_type, record_data in dataclasses.asdict(wdb).items():
        print(f" {record_type+':':21} {len(record_data): 6d} records")
@matomatical:

My solution to wandb/wandb#1768

@matomatical:

Most people are probably looking for history-type records, which contain the time series of metrics logged during experiments (though this script extracts all types of records). An example accessing the time series for a metric 'foo':

wandb_database = load_wandb_database('path/to/run.wandb')
for record in wandb_database.history:
    if 'foo' in record['item']:
        t = record['step']['num'] # t value used for `wandb.log` call
        x = record['item']['foo'] # logged value of 'foo' metric at this step
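
For example, here is a minimal sketch extending that loop into a learning-curve plot (assuming matplotlib is available; note that protobuf's MessageToDict renders the int64 step counter as a string, hence the cast):

import matplotlib.pyplot as plt

wandb_database = load_wandb_database('path/to/run.wandb')
steps = []
values = []
for record in wandb_database.history:
    if 'foo' in record['item']:
        steps.append(int(record['step']['num']))  # int64 arrives as str
        values.append(record['item']['foo'])      # parsed from value_json
plt.plot(steps, values)
plt.xlabel('step')
plt.ylabel('foo')
plt.show()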

@matomatical:

I'm finding that a lot of my data is 'corrupted' in ways that seem unlikely to be due to filesystem corruption, and more plausibly due to systematic errors in file writing, or incomplete writes from crashed experiments. I am working on the script to try to find a way to recover as much data as possible in these cases too.
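
In the meantime, here is a minimal sketch of partial recovery (assuming a corrupt chunk causes `scan_data` to raise; the exact exception type is a wandb internal detail, so this catches broadly). It only salvages the records that parse cleanly before the first error; recovering records after the corruption point requires resynchronising on the LevelDB block structure, which is the harder part I'm still working on:

import wandb
import wandb.proto.wandb_internal_pb2 as pb

def scan_records_until_error(path):
    """Yield parsed records up to the first corrupt chunk, then stop."""
    ds = wandb.sdk.internal.datastore.DataStore()
    ds.open_for_scan(path)
    while True:
        try:
            record_bin = ds.scan_data()
        except Exception as e:  # failure mode depends on wandb internals
            print(f"stopping at corrupt chunk: {e}")
            return
        if record_bin is None:
            return  # clean end of file
        record_pb = pb.Record()
        record_pb.ParseFromString(record_bin)
        yield record_pb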

@matomatical:

The corrupted databases were made with wandb-core's writer in wandb 0.17.5. The new backend was fixed in wandb/wandb#8088 and the fix was included from wandb 0.17.6 onwards; I was just unlucky to have been using core with the buggy version of wandb for a while.

I wrote a new log parser that (1) is robust to errors and (2) has an option to parse databases as if they were generated by versions of wandb with that bug. I packaged this parser as its own library here: https://github.com/matomatical/wunderbar.
