Pack data into HDF5, 1 GB at a time.
import logging

import numpy as np
import h5py


def get_fileobj_size(file_obj):
    """
    Gets the size of an open file object.

    Args:
        file_obj (FileObject): an open file object.

    Returns:
        int: size of the file in bytes
    """
    start_pos = file_obj.tell()
    file_obj.seek(0, 2)  # seek to the end of the file
    size = file_obj.tell()
    file_obj.seek(start_pos)  # restore the original position
    return size

def chunk2hdf5(h5_file,
               data_file,
               dtype,
               data_shape=(-1,),
               data_name="data",
               chunk_size=10**9):
    """
    Loads the data into the hdf5 file in chunks. Useful for really big
    binary files.

    Args:
        h5_file (h5py.File or str): An hdf5 file to add the data to.
        data_file (FileObject or str): A file containing a binary data set.
        dtype (numpy.dtype): intended data type of the binary data
        data_shape (Optional[tuple]): intended shape of the binary data
        data_name (Optional[str]): name of the destination dataset
        chunk_size (Optional[int]): maximum bytes per chunk. Default: 10^9

    Returns:
        int: total samples added to the hdf5 dataset.

    Raises:
        IOError: data file doesn't exist, or chunk shape is not a
            multiple of the row size.
        NameError: the hdf5 file already has a dataset by that name.
    """
    # get file objects if they gave us strings
    if isinstance(h5_file, str):
        h5_file = h5py.File(h5_file, 'a')
    if isinstance(data_file, str):
        data_file = open(data_file, 'rb')
    data_file.seek(0)
    if isinstance(data_shape, int):
        data_shape = (data_shape,)

    total_bytes = get_fileobj_size(data_file)
    itemsize = np.dtype(dtype).itemsize
    # bytes per row: itemsize times the product of the non-leading dims
    row_size = itemsize
    for dim in data_shape[1:]:
        row_size *= dim
    total_rows = total_bytes // row_size
    # round the chunk size down to a whole number of rows
    rows_per_chunk = chunk_size // row_size
    rounded_chunk_size = rows_per_chunk * row_size
    rounded_chunk_items = rounded_chunk_size // itemsize

    maxshape = [None] + list(data_shape[1:])
    data_shape = [total_rows] + list(data_shape[1:])

    # create the dataset if it doesn't exist
    if data_name not in h5_file.keys():
        dset = h5_file.create_dataset(data_name,
                                      shape=data_shape,
                                      dtype=dtype,
                                      maxshape=maxshape)
    else:
        raise NameError("Dataset already exists! Choose a new name.")

    # add the data to the dataset
    chunk_count = 0
    sample_count = 0
    reshape = [-1] + list(data_shape[1:])
    while True:
        # read a chunk
        chunk = np.fromfile(data_file,
                            dtype=dtype,
                            count=rounded_chunk_items)
        try:
            chunk = chunk.reshape(reshape)
        except ValueError:
            raise IOError("Chunk shape doesn't match desired shape.")
        # if the chunk has rows, write it; if not, we're done
        if chunk.shape[0] > 0:
            start = chunk_count * rows_per_chunk
            stop = start + chunk.shape[0]
            dset[start:stop] = chunk
            chunk_count += 1
            sample_count += chunk.size
        else:
            logging.info("chunk2hdf5 Finished!")
            break
    logging.info(" - moved {} samples in {} chunks.".format(sample_count,
                                                            chunk_count))
    return sample_count
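
For reference, a minimal usage sketch. The file names (big_recording.bin, packed.h5), the uint8 dtype, and the (-1, 480, 640) frame shape are made up for illustration; substitute your own binary dump and layout.

    import numpy as np
    import h5py

    # write a small fake binary file to pack (purely illustrative:
    # 10 "frames" of 480x640 uint8)
    np.arange(480 * 640 * 10, dtype=np.uint8).tofile("big_recording.bin")

    # pack it into an HDF5 dataset, one ~1 GB chunk at a time; passing
    # open handles (rather than strings) lets us close them cleanly
    with open("big_recording.bin", "rb") as binf, \
            h5py.File("packed.h5", "a") as h5:
        samples = chunk2hdf5(h5,
                             binf,
                             dtype=np.uint8,
                             data_shape=(-1, 480, 640),
                             data_name="frames")
        print("packed {} samples, dataset shape {}".format(
            samples, h5["frames"].shape))  # (10, 480, 640)

Note that when chunk2hdf5 is given strings instead, it opens the files itself but never closes them, so the with-block form above is the safer way to call it.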