Skip to content

Instantly share code, notes, and snippets.

@crusaderky
Created December 20, 2024 16:01
Show Gist options
  • Save crusaderky/b91549221447e966fb2b22c5177df724 to your computer and use it in GitHub Desktop.
Save crusaderky/b91549221447e966fb2b22c5177df724 to your computer and use it in GitHub Desktop.
versioned-hdf5 demo dataset builder
import argparse
import h5py
import numpy as np
from versioned_hdf5 import VersionedHDF5File
def gen_string(rs, j, k, dt):
    """Return a ``(j, k)`` array of random fixed-width strings of dtype *dt*.

    ``dt.itemsize`` random bytes are drawn from *rs* for every element, then
    reinterpreted as byte strings of that width and cast to *dt*.

    Note: ``randint`` excludes its upper bound, so the bytes drawn are in the
    range ``"A"``..``"Y"``; ``"Z"`` is never produced.
    """
    width = dt.itemsize
    # One uint8 per character, for all j * k elements at once.
    raw = rs.randint(ord("A"), ord("Z"), width * j * k, dtype="uint8")
    # Reinterpret the flat byte buffer as j * k byte strings of `width` bytes.
    as_bytes = raw.view(np.dtype(("S", width)))
    return as_bytes.astype(dt).reshape((j, k))
def gen_int(rs, j, k, dt):
    """Return a ``(j, k)`` array of random non-negative integers of dtype *dt*.

    Values are drawn from *rs* in the half-open range
    ``[0, np.iinfo(dt).max)``.
    """
    upper = np.iinfo(dt).max
    flat = rs.randint(0, upper, size=j * k, dtype=dt)
    return flat.reshape((j, k))
# Sizes of the two trailing dimensions of every generated array; main() uses
# them as the last two dimensions of each dataset's shape.
j = 1
k = 54436

# Every available value axis, in creation order:
# name -> (generator function, dtype of the dataset).
value_axes_full = {
    axis_name: (generator, np.dtype(type_code))
    for axis_name, generator, type_code in (
        ("lorem", gen_string, "S10"),
        ("ipsum", gen_int, "int32"),
        ("dolor", gen_int, "int64"),
        ("sit", gen_int, "int32"),
        ("amet", gen_int, "int64"),
        ("consectetur", gen_string, "S20"),
        ("adipiscing", gen_string, "S40"),
        ("elit", gen_int, "int64"),
        ("sed", gen_int, "int64"),
        ("do", gen_int, "int64"),
        ("eiusmod", gen_int, "int64"),
        ("tempor", gen_int, "int64"),
        ("incididunt", gen_int, "int64"),
        ("ut", gen_int, "int64"),
        ("labore", gen_int, "int64"),
        ("et", gen_int, "int64"),
        ("dolore", gen_int, "int64"),
        ("magna", gen_int, "int64"),
        ("aliqua", gen_int, "int64"),
    )
}
def main() -> None:
    """Build a demo versioned-hdf5 file from command-line arguments.

    Command line:
        fname                  path of the HDF5 file to create (overwritten)
        -n / --versions        total number of versions r0..r{n-1} (default 101)
        -l / --levels          number of groups under ``data/values`` (default 5)
        -a / --axes            how many entries of ``value_axes_full`` to use
        -p / --print-datasets  list every dataset in the file when finished

    Version ``r0`` creates one empty, resizable dataset per level and axis at
    ``data/values/<level>/<axis>``. Each later version ``r<i>`` reopens the
    file, grows every dataset to ``i`` rows and fills the new last row with
    generated data. Version numbers are printed as progress while running.

    Raises:
        ValueError: if ``--axes`` is negative or exceeds the number of
            available axes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("fname")
    parser.add_argument("-n", "--versions", type=int, default=101)
    parser.add_argument("-l", "--levels", type=int, default=5)
    parser.add_argument("-a", "--axes", type=int, default=len(value_axes_full))
    parser.add_argument("-p", "--print-datasets", action="store_true")
    args = parser.parse_args()

    if args.axes > len(value_axes_full):
        raise ValueError(f"Can't have more than {len(value_axes_full)} axes")
    if args.axes < 0:
        # A negative count would otherwise be treated as a negative slice
        # bound and silently drop axes from the end of the table.
        raise ValueError("Can't have a negative number of axes")
    value_axes = dict(list(value_axes_full.items())[: args.axes])

    # Version r0: create the group hierarchy and the empty datasets.
    print(0, end=" ", flush=True)
    with h5py.File(args.fname, "w") as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version("r0") as sv:
            values = sv.create_group("data").create_group("values")
            for level in range(args.levels):
                level_values = values.create_group(str(level))
                for value_axis, (_, dt) in value_axes.items():
                    level_values.create_dataset(
                        value_axis,
                        shape=(0, j, k),
                        maxshape=(None, None, None),
                        dtype=dt,
                        chunks=(10, 1, 1250),
                    )

    # Versions r1..r{n-1}: the file is reopened for every version, and each
    # dataset gains one row along its first axis.
    for i in range(1, args.versions):
        print(i, end=" ", flush=True)
        with h5py.File(args.fname, "r+") as f:
            vf = VersionedHDF5File(f)
            with vf.stage_version(f"r{i}") as sv:
                for level in range(args.levels):
                    for value_axis, (gen, dt) in value_axes.items():
                        # reuse data across versions by reusing RandomState
                        rs = np.random.RandomState(i % 4)
                        ds = sv["data/values"][str(level)][value_axis]
                        ds.resize((i, j, k))
                        ds[-1] = gen(rs, j, k, dt)
    # Terminate the progress line.
    print()

    if args.print_datasets:

        def print_datasets(name, obj):
            """Print the path of *obj* if it is a dataset (visititems callback)."""
            if isinstance(obj, h5py.Dataset):
                print(name)

        with h5py.File(args.fname, "r") as f:
            f.visititems(print_datasets)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment