Demonstration of how to trim and repack (change compression, chunking, etc.) a very large HDF5 dataset in an NWB file
from pynwb import NWBHDF5IO
import pynwb
from hdmf.data_utils import GenericDataChunkIterator
from hdmf.backends.hdf5.h5_utils import H5DataIO

filepath = r"D:\GiocomoData_dandiset53\000053\sub-npI1\sub-npI1_ses-20190413_behavior+ecephys.nwb"

class H5DatasetDataChunkIterator(GenericDataChunkIterator):
    """A data chunk iterator that reads chunks over the 0th dimension of an HDF5 dataset up to a max length."""

    def __init__(self, dataset, max_length, **kwargs):
        self.dataset = dataset
        self.max_length = max_length  # in the time (0th) dimension
        super().__init__(**kwargs)

    def _get_data(self, selection):
        return self.dataset[selection]

    def _get_maxshape(self):
        # guard against max_length exceeding the actual length of the dataset
        return (min(self.max_length, self.dataset.shape[0]), self.dataset.shape[1])

    def _get_dtype(self):
        return self.dataset.dtype

with NWBHDF5IO(filepath, "r", load_namespaces=True) as io:
    nwbfile = io.read()
    orig_eseries = nwbfile.acquisition["ElectricalSeries"]

    electrodes = nwbfile.create_electrode_table_region(
        region=orig_eseries.electrodes.data[:].tolist(),
        name=orig_eseries.electrodes.name,
        description=orig_eseries.electrodes.description,
    )
    num_electrodes = orig_eseries.data.shape[1]
    max_timestamps = int(2e6)  # TODO set this to the maximum number of timestamps to be read

    # the original dataset is already chunked. for optimal read, read 1 chunk at a time by
    # setting the read chunk shape to align with the dataset chunk shape
    assert orig_eseries.data.chunks
    selection_size_time = orig_eseries.data.chunks[0]

    # read the ElectricalSeries data iteratively in chunks because it is too big to fit into RAM
    data_iterator = H5DatasetDataChunkIterator(
        dataset=orig_eseries.data,
        max_length=max_timestamps,
        chunk_shape=(selection_size_time, num_electrodes),  # this chunk shape is for read
        buffer_gb=4,  # TODO set this to a little under the amount of free RAM available in GB
    )

    # create an H5DataIO object which sets HDF5-specific filters and other write options
    data = H5DataIO(
        data=data_iterator,
        compression="gzip",
        compression_opts=4,
        chunks=(100, 100),  # this chunk shape is for write TODO set this accordingly
        # TODO pass other options to H5DataIO
    )

    # create the new ElectricalSeries with the same parameters as the original ElectricalSeries
    # except with a different dataset
    new_eseries = pynwb.ecephys.ElectricalSeries(
        name=orig_eseries.name,
        description=orig_eseries.description,
        data=data,
        electrodes=electrodes,
        starting_time=orig_eseries.starting_time,
        rate=orig_eseries.rate,
        conversion=orig_eseries.conversion,
        resolution=orig_eseries.resolution,
        comments=orig_eseries.comments,
    )

    nwbfile.acquisition.pop("ElectricalSeries")  # remove the existing ElectricalSeries
    nwbfile.add_acquisition(new_eseries)  # add the newly chunked ElectricalSeries
    nwbfile.processing.pop("ecephys")  # remove the ecephys processing module

    with pynwb.NWBHDF5IO("dandiset53_trim_iterator.nwb", "w", manager=io.manager) as export_io:
        export_io.export(io, nwbfile)
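
Once the export completes, it can be worth confirming that the repacked dataset really has the requested chunking and compression. Below is a minimal sketch using h5py, assuming the standard NWB HDF5 layout where the acquisition data lands at /acquisition/ElectricalSeries/data; it is not part of the original script.

import h5py

# Hypothetical verification step; the dataset path assumes the standard NWB layout
with h5py.File("dandiset53_trim_iterator.nwb", "r") as f:
    dset = f["acquisition/ElectricalSeries/data"]
    print("shape:", dset.shape)                         # expect (max_timestamps, num_electrodes)
    print("chunks:", dset.chunks)                       # expect (100, 100), as set in H5DataIO
    print("compression:", dset.compression)             # expect "gzip"
    print("compression_opts:", dset.compression_opts)   # expect 4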
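
The iterator subclass can also be exercised on its own against a small synthetic HDF5 dataset, which makes the trimming and chunked-read behavior easier to see. This is a rough sketch reusing the H5DatasetDataChunkIterator class above; the file name demo_iterator.h5 and all sizes are arbitrary choices, not values from the original gist.

import numpy as np
import h5py

# Small synthetic dataset: 1000 time steps x 8 channels, chunked along time
with h5py.File("demo_iterator.h5", "w") as f:
    dset = f.create_dataset("data", data=np.random.rand(1000, 8), chunks=(100, 8))
    it = H5DatasetDataChunkIterator(
        dataset=dset,
        max_length=300,        # trim to the first 300 rows
        chunk_shape=(100, 8),  # read one HDF5 chunk at a time
        buffer_gb=0.001,       # tiny buffer just to force several buffered reads
    )
    # each yielded DataChunk carries the data for one chunk-shaped selection
    total_rows = sum(data_chunk.data.shape[0] for data_chunk in it)
    print(total_rows)          # 300 rows read, not the full 1000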