Last active
May 11, 2020 06:57
-
-
Save jfsantos/e2ef822c744357a4ed16ec0c885100a3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from keras.models import Sequential | |
from keras.layers import Dense | |
from keras.utils.io_utils import HDF5Matrix | |
import numpy as np | |
def create_dataset():
    """Write a small random dataset to ``test.h5`` in the working directory.

    The file is opened in ``'w'`` mode, so an existing ``test.h5`` is
    overwritten. Two HDF5 datasets are created:

    * ``my_data``   -- features, shape ``(200, 10)``, dtype ``'f'``,
      filled with standard-normal samples.
    * ``my_labels`` -- labels, shape ``(200, 1)``, dtype ``'i'``,
      filled with random integers drawn from ``{0, 1}``.

    Returns:
        None. The only effect is the file written to disk.
    """
    # Imported locally so h5py is only required when the file is generated.
    import h5py

    X = np.random.randn(200, 10).astype('float32')
    y = np.random.randint(0, 2, size=(200, 1))

    # The context manager guarantees the file is flushed and closed even if
    # creating or filling one of the datasets raises.
    with h5py.File('test.h5', 'w') as f:
        # Creating dataset to store features
        X_dset = f.create_dataset('my_data', (200, 10), dtype='f')
        X_dset[:] = X
        # Creating dataset to store labels
        y_dset = f.create_dataset('my_labels', (200, 1), dtype='i')
        y_dset[:] = y
# Generate test.h5 on disk before any HDF5Matrix tries to open it.
create_dataset()

# Training split: rows [0, 150) of the feature and label datasets.
# Each HDF5Matrix is a view onto a slice of a dataset stored in the file.
X_train, y_train = (
    HDF5Matrix('test.h5', dataset_name, start=0, end=150)
    for dataset_name in ('my_data', 'my_labels')
)

# Test split: the remaining rows [150, 200).
X_test, y_test = (
    HDF5Matrix('test.h5', dataset_name, start=150, end=200)
    for dataset_name in ('my_data', 'my_labels')
)

# An HDF5Matrix can be indexed much like a NumPy array...
print(y_train[10])
# ...but negative indices are not supported, so don't try print(X_train[-1]).

# Small binary classifier: 10 inputs -> 64 ReLU units -> 1 sigmoid output.
model = Sequential([
    Dense(64, input_shape=(10,), activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='sgd', loss='binary_crossentropy')

# Note: with HDF5Matrix inputs, shuffle must be 'batch' or False.
model.fit(X_train, y_train, shuffle='batch', batch_size=32)

model.evaluate(X_test, y_test, batch_size=32)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the generator tip @Shawn-Shan. That meant I could actually fit my 200 GB data!
Note that I had to change
`y_all = self.hf['y_train'][:]`
and `data = hf['y_train'][:]`,
since those lines load all of the data into memory. It's much more efficient to just use the shape of the data, like so: `nrows = self.hf["y_train"].shape[0]`,
and then set `self.total_len = nrows`
and `train_len = nrows`.