-
-
Save jfsantos/e2ef822c744357a4ed16ec0c885100a3 to your computer and use it in GitHub Desktop.
import h5py
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import Sequence
from keras.utils.io_utils import HDF5Matrix
def create_dataset(path='test.h5', n_samples=200, n_features=10):
    """Write a small random dataset to an HDF5 file.

    Two datasets are created in the file:

    * ``'my_data'``   -- features, shape ``(n_samples, n_features)``,
      drawn from ``np.random.randn`` and cast to ``float32``.
    * ``'my_labels'`` -- labels, shape ``(n_samples, 1)``, random integers
      that are either 0 or 1.

    Args:
        path: Location of the HDF5 file; opened in ``'w'`` mode.
        n_samples: Number of rows to generate.
        n_features: Number of feature columns per row.
    """
    import h5py

    X = np.random.randn(n_samples, n_features).astype('float32')
    y = np.random.randint(0, 2, size=(n_samples, 1))

    # The context manager closes the file even if one of the writes fails.
    with h5py.File(path, 'w') as f:
        # Creating dataset to store features
        X_dset = f.create_dataset('my_data', (n_samples, n_features), dtype='f')
        X_dset[:] = X
        # Creating dataset to store labels
        y_dset = f.create_dataset('my_labels', (n_samples, 1), dtype='i')
        y_dset[:] = y
# Generate 'test.h5' so the HDF5Matrix instances below have something to read.
create_dataset()

# Instantiating HDF5Matrix for the training set, which is a slice of the first 150 elements
X_train = HDF5Matrix('test.h5', 'my_data', start=0, end=150)
y_train = HDF5Matrix('test.h5', 'my_labels', start=0, end=150)

# Likewise for the test set
X_test = HDF5Matrix('test.h5', 'my_data', start=150, end=200)
y_test = HDF5Matrix('test.h5', 'my_labels', start=150, end=200)

# HDF5Matrix behave more or less like Numpy matrices with regards to indexing
print(y_train[10])
# But they do not support negative indices, so don't try print(X_train[-1])

# Small binary classifier: 10 input features -> 64 ReLU units -> 1 sigmoid output.
model = Sequential()
model.add(Dense(64, input_shape=(10,), activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='sgd')

# Note: you have to use shuffle='batch' or False with HDF5Matrix
model.fit(X_train, y_train, batch_size=32, shuffle='batch')

model.evaluate(X_test, y_test, batch_size=32)
Works well until you create HDF5 using Pandas
HDF5Matrix is much slower when I read the data batch by batch or in a for loop. Here is a quick modification:
# Path of the HDF5 file used below; it is opened for its 'x_train' and 'y_train' datasets.
file_name = "data.h5"
class DataGenerator(Sequence):
    """Stream batches from an HDF5 file one cached segment at a time.

    The file must contain the datasets ``'x_train'`` and ``'y_train'``. Rows
    are read in contiguous segments of ``len_segment`` rows; the current
    segment is held in memory and batches are sliced from it, so the HDF5
    file is only touched once per segment.

    The read position (``idx``, ``cur_seg_idx``) is stored on the instance, so
    one generator is meant to be consumed from a single place.
    NOTE(review): ``Sequence`` and ``h5py`` are expected to be in scope at
    module level (``from keras.utils import Sequence``, ``import h5py``).

    Args:
        file_name: Path of the HDF5 file, opened read-only.
        batch_size: Maximum number of rows per yielded batch.
        data_split: Number of segments the data is divided into.

    Raises:
        ValueError: If ``batch_size`` or ``data_split`` is not positive, or
            if ``'y_train'`` has no rows.
    """

    def __init__(self, file_name, batch_size=1024, data_split=100):
        super(DataGenerator, self).__init__()
        if batch_size <= 0 or data_split <= 0:
            raise ValueError("batch_size and data_split must be positive")

        self.hf = h5py.File(file_name, 'r')
        # Only the row count is needed; reading the shape avoids pulling the
        # whole label dataset into memory.
        self.total_len = self.hf['y_train'].shape[0]
        if self.total_len == 0:
            self.hf.close()
            raise ValueError("'y_train' in %r is empty" % (file_name,))

        self.batch_size = batch_size
        self.idx = 0
        # Never let a segment be empty, even when there are fewer rows than
        # data_split; an empty segment would produce empty batches forever.
        self.len_segment = max(1, self.total_len // data_split)
        self.cur_seg_idx = 0
        self.x_cur = self.hf['x_train'][:self.len_segment]
        self.y_cur = self.hf['y_train'][:self.len_segment]

    def next_seg(self):
        """Load the next segment, wrapping to the start after the last row."""
        self.cur_seg_idx += self.len_segment
        if self.cur_seg_idx >= self.total_len:
            # End of data reached: start over so later epochs get real batches.
            self.cur_seg_idx = 0
        seg_end = self.cur_seg_idx + self.len_segment
        self.x_cur = self.hf['x_train'][self.cur_seg_idx:seg_end]
        self.y_cur = self.hf['y_train'][self.cur_seg_idx:seg_end]

    def generate(self):
        """Yield ``(batch_x, batch_y)`` tuples indefinitely.

        The last batch of a segment may hold fewer than ``batch_size`` rows.
        """
        while True:
            # Compare against the rows actually loaded: the final segment of
            # the file can be shorter than len_segment.
            if self.idx >= len(self.x_cur):
                self.next_seg()
                self.idx = 0
            start = self.idx
            stop = start + self.batch_size
            self.idx = stop
            # Slicing clips at the end of the segment, giving a short batch.
            yield self.x_cur[start:stop], self.y_cur[start:stop]

    def close(self):
        """Close the underlying HDF5 file."""
        self.hf.close()
# Only the number of rows is needed, so read the dataset's shape instead of
# loading every label into memory.
with h5py.File(file_name, 'r') as hf:
    train_len = hf['y_train'].shape[0]

batch_size = 1024
# Keep at least one step per epoch, even when the data is smaller than a batch.
x_len = max(1, train_len // batch_size)

training_generator = DataGenerator(file_name, batch_size=batch_size).generate()
# A single worker without multiprocessing: the generator keeps its read
# position as instance state.
model.fit_generator(generator=training_generator,
                    epochs=1,
                    steps_per_epoch=x_len, workers=1,
                    use_multiprocessing=False,
                    verbose=1)
It uses a generator: the large dataset, which can't fit into memory as a whole, is split into 100 segments, and batches are generated from one segment at a time.
@Shawn-Shan, thanks a lot!
@Shawn-Shan, can we use it with multiple workers?
@Shawn-Shan, can we use it with multiple workers?
I think it should not be used with multiple workers.
@Shawn-Shan
Thx, for your solution!
Reading from HDF5 is extremely slow.
Before I adopt your solution, it is like 200s per epoch for my training.
After I use your cache solution, it is like 17s per epoch.
And for my use case (I use the Sequence interface), I need to set shuffle=False explicitly.
Thanks for the generator tip @Shawn-Shan. That meant I could actually fit my 200 GB data!
Note that I had to change `y_all = self.hf['y_train'][:]`
and `data = hf['y_train'][:]`,
since they load all the data into memory. It's much more efficient to just use the shape of the dataset, like so: `nrows = self.hf["y_train"].shape[0]`,
and then set `self.total_len = nrows`
and `train_len = nrows`.
@lamenramen I got the same error. Did you ever figure it out?