Created
December 20, 2024 16:01
-
-
Save crusaderky/b91549221447e966fb2b22c5177df724 to your computer and use it in GitHub Desktop.
versioned-hdf5 demo dataset builder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import h5py | |
import numpy as np | |
from versioned_hdf5 import VersionedHDF5File | |
def gen_string(rs, j, k, dt):
    """Return a ``(j, k)`` array of random fixed-width byte strings.

    Parameters
    ----------
    rs : numpy.random.RandomState
        Source of randomness; its state is advanced by the call.
    j, k : int
        Shape of the returned array.
    dt : numpy.dtype
        Target string dtype; ``dt.itemsize`` random bytes are drawn per element.

    Notes
    -----
    ``randint`` excludes its upper bound, so only the byte values of
    ``"A"`` through ``"Y"`` are ever produced.
    """
    width = dt.itemsize
    # One random uppercase byte for every character of every element.
    raw = rs.randint(ord("A"), ord("Z"), width * j * k, dtype="uint8")
    # Reinterpret each run of `width` bytes as a single bytes string.
    packed = raw.view(np.dtype(("S", width)))
    return packed.astype(dt).reshape((j, k))
def gen_int(rs, j, k, dt):
    """Return a ``(j, k)`` array of random non-negative integers of dtype *dt*.

    Parameters
    ----------
    rs : numpy.random.RandomState
        Source of randomness; its state is advanced by the call.
    j, k : int
        Shape of the returned array.
    dt : numpy.dtype
        Integer dtype of the result. Values are drawn from
        ``[0, np.iinfo(dt).max)``; the upper bound itself is excluded.
    """
    upper = np.iinfo(dt).max
    flat = rs.randint(0, upper, size=j * k, dtype=dt)
    return flat.reshape((j, k))
# Shape of the 2-D slab that each generator call produces and that main()
# stores as one row along the first axis of every dataset.
j, k = 1, 54436

# Catalogue of demo datasets: name -> (generator function, element dtype).
# Insertion order matters: ``--axes N`` keeps only the first N entries.
value_axes_full = {
    name: (make, np.dtype(code))
    for name, make, code in (
        ("lorem", gen_string, "S10"),
        ("ipsum", gen_int, "int32"),
        ("dolor", gen_int, "int64"),
        ("sit", gen_int, "int32"),
        ("amet", gen_int, "int64"),
        ("consectetur", gen_string, "S20"),
        ("adipiscing", gen_string, "S40"),
        ("elit", gen_int, "int64"),
        ("sed", gen_int, "int64"),
        ("do", gen_int, "int64"),
        ("eiusmod", gen_int, "int64"),
        ("tempor", gen_int, "int64"),
        ("incididunt", gen_int, "int64"),
        ("ut", gen_int, "int64"),
        ("labore", gen_int, "int64"),
        ("et", gen_int, "int64"),
        ("dolore", gen_int, "int64"),
        ("magna", gen_int, "int64"),
        ("aliqua", gen_int, "int64"),
    )
}
def main() -> None:
    """Build a versioned-hdf5 demo file from command-line arguments.

    Version ``r0`` creates one empty, resizable dataset per selected value
    axis under ``data/values/<level>``. Every later version ``r<i>`` reopens
    the file, grows each dataset to ``i`` rows along the first axis and fills
    the new last row with generated data.

    Raises
    ------
    ValueError
        If ``--axes`` is negative or exceeds the number of known value axes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("fname")
    parser.add_argument("-n", "--versions", type=int, default=101)
    parser.add_argument("-l", "--levels", type=int, default=5)
    parser.add_argument("-a", "--axes", type=int, default=len(value_axes_full))
    parser.add_argument("-p", "--print-datasets", action="store_true")
    args = parser.parse_args()

    if args.axes > len(value_axes_full):
        raise ValueError(f"Can't have more than {len(value_axes_full)} axes")
    if args.axes < 0:
        # A negative count would otherwise silently slice axes off the end
        # of the catalogue instead of selecting the first N.
        raise ValueError("Can't have a negative number of axes")
    # Keep the first N axes, in catalogue order.
    value_axes = dict(list(value_axes_full.items())[: args.axes])

    # Progress indicator: one number per version, all on a single line.
    print(0, end=" ", flush=True)
    with h5py.File(args.fname, "w") as f:
        vf = VersionedHDF5File(f)
        with vf.stage_version("r0") as sv:
            values = sv.create_group("data").create_group("values")
            for level in range(args.levels):
                level_values = values.create_group(str(level))
                # Only the dtype is needed here; no data is generated for r0.
                for value_axis, (_, dt) in value_axes.items():
                    level_values.create_dataset(
                        value_axis,
                        shape=(0, j, k),
                        maxshape=(None, None, None),
                        dtype=dt,
                        chunks=(10, 1, 1250),
                    )

    for i in range(1, args.versions):
        print(i, end=" ", flush=True)
        # The file is reopened for every version that gets staged.
        with h5py.File(args.fname, "r+") as f:
            vf = VersionedHDF5File(f)
            with vf.stage_version(f"r{i}") as sv:
                for level in range(args.levels):
                    for value_axis, (gen, dt) in value_axes.items():
                        # reuse data across versions by reusing RandomState:
                        # the seed cycles through 0..3, and a fresh generator
                        # is created for every dataset.
                        rs = np.random.RandomState(i % 4)
                        ds = sv["data/values"][str(level)][value_axis]
                        # Grow by one row and fill the new last row.
                        ds.resize((i, j, k))
                        ds[-1] = gen(rs, j, k, dt)
    print()

    if args.print_datasets:
        with h5py.File(args.fname, "r") as f:

            def print_datasets(name, obj):
                # visititems visits groups too; only report datasets.
                if isinstance(obj, h5py.Dataset):
                    print(name)

            f.visititems(print_datasets)
# Run the builder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment