# Imports assumed for the standalone Keras 2.x API used below; swap in the
# tensorflow.keras equivalents if that is what you are running.
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback


class CharRNN():

    def __init__(self,
                 vocabulary_size,
                 sequence_length,
                 dropout_rate=0.0,
                 batch_size=32,
                 rnn_size=128,
                 amount_layers=2,
                 embedding_size=32,
                 learning_rate=0.001,
                 clip_norm=5.0):
"""Construct the CharRNN. | |
Sets the key member fields and then builds the Keras network. | |
Params: | |
vocabulary_size: int - the number of unique chars in the dictionary. | |
sequence_length: int - the length of the sequences to be input of the RNN. | |
dropout_rate: float - the drop rate of neurons in the network. | |
batch_size: int - the size of the batch of sequences passed to the network. | |
rnn_size: int - the size of the RNN memory. | |
amount_layers: int - the depth of the RNN. | |
embedding_size: int - the size of the embedding vectors at the input of the RNN. | |
learning_rate: float - the size of the gradient updates to the neural network. | |
clip_norm: float - the magnitude the gradient is clipped by the maximum l2 norm. | |
""" | |
        # Set the member fields.
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.dropout_rate = dropout_rate
        self.amount_layers = amount_layers
        self.rnn_size = rnn_size
        self.sequence_length = sequence_length
        self.learning_rate = learning_rate
        self.clip_norm = clip_norm

        # Build the model.
        self.model = self.build_model()
    def build_model(self):
        """Builds the model.

        Called in the constructor.

        Returns:
            model: Keras Sequential - the RNN used to model the biographies!
        """
        # Model container object.
        model = Sequential()

        # Create an embedding layer to map sequences of tokens to sequences of
        # vectors of embedding-size length.
        model.add(Embedding(self.vocabulary_size,
                            self.embedding_size,
                            input_length=self.sequence_length))

        # Add dropout (a no-op if the rate is zero).
        model.add(Dropout(self.dropout_rate))

        # Stack layers of RNNs.
        for layer_number in range(self.amount_layers):
            # If last layer, return only the final value, not a sequence.
            many = layer_number != (self.amount_layers - 1)
            model.add(LSTM(self.rnn_size, return_sequences=many))
            model.add(Dropout(self.dropout_rate))

        # Add a dense layer to resize the output to the number of unique chars
        # in the bio dataset.
        model.add(Dense(self.vocabulary_size, activation="softmax"))

        # RMSprop is a good fit for RNNs!
        optimiser = RMSprop(self.learning_rate, clipnorm=self.clip_norm)

        # Sparse categorical cross entropy lets us use integer tokens as targets.
        model.compile(loss='sparse_categorical_crossentropy', optimizer=optimiser)
        return model
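
    # For the defaults above (rnn_size=128, amount_layers=2, embedding_size=32),
    # build_model therefore stacks, purely as a restatement of the code above:
    #     Embedding -> Dropout -> LSTM(return_sequences=True) -> Dropout
    #     -> LSTM -> Dropout -> Dense(vocabulary_size, softmax)
    # compiled with sparse categorical cross entropy and gradient-clipped RMSprop.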

    def inference(self,
                  start_tokens,
                  dataset,
                  inference_length=400,
                  temperature=0.2,
                  include_start_tokens=True):
        """Use the trained model.

        Samples the model with a temperature; the greater the value, the more
        random the output is. The start tokens could either be completely
        random or sampled from the dataset.

        Params:
            start_tokens: list(int) - the tokens to start the model with.
            dataset: SequenceDataset - the dataset class to convert the tokens to chars.
            inference_length: int - the length of the generated material.
            temperature: float - the randomness of the generated material.
            include_start_tokens: bool - prepend the start tokens' chars to the output.
        Returns:
            generated: str - string of generated characters.
        """
        # Helper function to sample an index from a probability array (the
        # output layer of the network!)
        def sample(preds, temperature=1.0):
            preds = np.asarray(preds).astype('float64')
            # A temperature of zero (or below) means greedy, deterministic
            # sampling, and also avoids dividing by zero below.
            if temperature <= 0.0:
                return np.argmax(preds)
            preds = np.log(preds) / temperature
            exp_preds = np.exp(preds)
            preds = exp_preds / np.sum(exp_preds)
            probas = np.random.multinomial(1, preds, 1)
            return np.argmax(probas)

        # Sanity check.
        if len(start_tokens) != self.sequence_length:
            raise ValueError('Argument start_tokens must be sequence length!')

        # Don't operate on batches of sequences, just a single one. Convert
        # numpy arrays to lists.
        if type(start_tokens) is np.ndarray:
            if start_tokens.ndim > 1:
                raise ValueError('Pass in a single sequence, not >=2D')
            start_tokens = start_tokens.tolist()

        # String to hold the generated chars, optionally seeded with the
        # characters of the start tokens.
        generated = ''
        if include_start_tokens:
            generated = ''.join(dataset.token_to_char[token] for token in start_tokens)

        # Tokens to be inputted into the model.
        tokens = start_tokens

        # Loop and generate the material.
        for _ in range(inference_length):
            # Create the inputs in a batch shape.
            model_inputs = np.array(tokens).reshape((1, self.sequence_length))

            # Get the predicted vector based on the inputs.
            preds = self.model.predict(model_inputs, verbose=0)[0]

            # Derive next token and then char from predicted vector.
            next_token = sample(preds, temperature)
            next_char = dataset.token_to_char[next_token]

            # Store char and create next input token sequence.
            generated += next_char
            tokens = tokens[1:] + [next_token]

        return generated
    def save(self, path='model.h5'):
        """Save the model weights and architecture to disk."""
        self.model.save(path)

    def load(self, path='model.h5'):
        """Load a previously saved model from disk."""
        self.model = load_model(path)
    def train(self, dataset, epochs=60, print_inference_progress=False):
        """Optimise the model on the dataset.

        Params:
            dataset: SequenceDataset - the dataset class to convert the tokens to chars.
            epochs: int - the number of times the model will see each data point in the training set.
            print_inference_progress: bool - print sampled text from the model at the end of each epoch.
        """
        # Function invoked at the end of each epoch. Prints generated text.
        def on_epoch_end(epoch, logs):
            print()
            print('----- Generating text after Epoch: %d' % epoch)
            for temperature in [0.0, 0.25, 0.5, 0.8]:
                print('----- temperature:', temperature)
                results = self.inference(dataset.empty_start_tokens,
                                         dataset,
                                         inference_length=400,
                                         temperature=temperature,
                                         include_start_tokens=True)
                print(results)

        # Potentially print progress of the model during training.
        callbacks = [LambdaCallback(on_epoch_end=on_epoch_end)] if print_inference_progress else []

        # Fit the model on the dataset.
        self.model.fit(dataset.dataset_x,
                       dataset.dataset_y,
                       batch_size=self.batch_size,
                       epochs=epochs,
                       callbacks=callbacks)
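

# A minimal usage sketch (not part of the original gist). It assumes a
# SequenceDataset class exposing the attributes CharRNN already relies on
# (dataset_x, dataset_y, token_to_char, empty_start_tokens); the constructor
# argument and the vocabulary_size / sequence_length attributes used here are
# hypothetical and only for illustration.
if __name__ == '__main__':
    dataset = SequenceDataset('biographies.txt')  # hypothetical constructor

    char_rnn = CharRNN(vocabulary_size=dataset.vocabulary_size,
                       sequence_length=dataset.sequence_length,
                       dropout_rate=0.2)

    # Fit on the tokenised text, printing sampled text after each epoch.
    char_rnn.train(dataset, epochs=60, print_inference_progress=True)
    char_rnn.save('char_rnn.h5')

    # Generate 400 characters from an empty (padding) seed sequence.
    print(char_rnn.inference(dataset.empty_start_tokens,
                             dataset,
                             inference_length=400,
                             temperature=0.5))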