import argparse | |
import collections | |
import datetime | |
import math | |
import numpy as np | |
import os | |
import sys | |
import tensorflow as tf | |
import time | |
import warnings | |
import matplotlib.pyplot as plot | |
import seaborn | |
tf.set_random_seed(sum(map(ord, 'chicken'))) | |
warnings.filterwarnings('ignore') | |
seaborn.set_style('dark') | |
Data = collections.namedtuple('Data', 'samples, number_of_streams, names') | |
Graph = collections.namedtuple('Graph', ['x', | |
'y', | |
'initial_state', | |
'final_state', | |
'loss', | |
'predictions', | |
'accuracy', | |
'summaries', | |
'learning_rate', | |
'dropout', | |
'train']) | |
DATA_STREAMS = dict(sine=lambda t: np.sin(1/20 * t), | |
sine1=lambda t: np.sin(1/20 * t - 1), | |
sine2=lambda t: np.sin(1/20 * t - 2), | |
sine3=lambda t: np.sin(1/20 * t - 3), | |
sine4=lambda t: np.sin(1/20 * t - 4), | |
sine5=lambda t: np.sin(1/20 * t - 5), | |
cosine=lambda t: np.cos(1/20 * t), | |
linear=lambda t: 1/100 * t, | |
sine_up=lambda t: 2 * np.sin(1/20 * t) + t/10000, | |
sawtooth=lambda t: (t/100 - np.floor(t/100)) - 0.5, | |
mod=lambda t: (t/100 % 1), | |
sinsq=lambda t: np.sin((t/10)**2), | |
delta10=lambda t: 1 if t % 10 == 0 else 0, | |
delta10_1=lambda t: 1 if t % 10 == 1 else 0, | |
delta10_2=lambda t: 1 if t % 10 == 2 else 0, | |
delta10_3=lambda t: 1 if t % 10 == 3 else 0, | |
delta10_4=lambda t: 1 if t % 10 == 4 else 0, | |
delta100=lambda t: 1 if t % 100 == 0 else 0) | |
INITIALIZERS = dict(orthogonal=tf.orthogonal_initializer(), | |
random_uniform=tf.random_uniform_initializer(), | |
random_normal=tf.random_normal_initializer(), | |
truncated_normal=tf.truncated_normal_initializer(), | |
                    glorot=None)  # None -> tf.get_variable's default initializer
REGULARIZERS = dict(l1=tf.contrib.layers.l1_regularizer(0.1), | |
l2=tf.contrib.layers.l2_regularizer(0.1)) | |
def dense(inputs, | |
output_size, | |
activation=None, | |
initializer=None, | |
regularizer=None): | |
weights = tf.get_variable('W', | |
[inputs.shape[1], output_size], | |
initializer=initializer, | |
regularizer=regularizer) | |
bias = tf.get_variable('b', | |
[output_size], | |
initializer=tf.constant_initializer(0.1)) | |
output = tf.matmul(inputs, weights) + bias | |
if activation is not None: | |
output = activation(output) | |
return output | |
def feedforward(inputs, | |
output_size, | |
number_of_layers, | |
units_per_layer, | |
hidden_activation, | |
final_activation=None, | |
initializer=None, | |
regularizer=None): | |
if number_of_layers == 0: | |
return inputs | |
tensors = inputs | |
for layer in range(1, number_of_layers): | |
with tf.variable_scope('hidden-{0}'.format(layer)): | |
tensors = dense(tensors, | |
units_per_layer, | |
hidden_activation, | |
initializer, | |
regularizer) | |
with tf.variable_scope('final'): | |
output = dense(tensors, | |
output_size, | |
final_activation, | |
initializer, | |
regularizer) | |
return output | |
def leaky_relu(x, alpha=0.05): | |
return tf.maximum(alpha * x, x) | |
class ScrappyCell(tf.contrib.rnn.RNNCell): | |
def __init__(self, | |
state_size, | |
hidden_units, | |
hidden_layers, | |
hidden_activation=leaky_relu, | |
final_activation=tf.tanh, | |
reuse=None): | |
self._state_size = state_size | |
self._reuse = reuse | |
self._hidden_units = hidden_units | |
self._hidden_layers = hidden_layers | |
self._hidden_activation = hidden_activation | |
self._final_activation = final_activation | |
status = ('Initializing Scrappy cell with {0} units in {1} layers ' | |
'with {2} as hidden activation and {3} as final activation') | |
print(status.format(hidden_units, | |
hidden_layers, | |
hidden_activation.__name__, | |
final_activation.__name__)) | |
@property | |
def state_size(self): | |
return self._state_size | |
@property | |
def output_size(self): | |
return self._state_size | |
def __call__(self, inputs, state, scope=None): | |
with tf.variable_scope(scope or type(self).__name__, reuse=self._reuse): | |
concatenated = tf.concat([inputs, state], axis=1) | |
state = dense(concatenated, self.output_size, self._final_activation) | |
output = feedforward(inputs=state, | |
output_size=self.output_size, | |
number_of_layers=self._hidden_layers, | |
units_per_layer=self._hidden_units, | |
hidden_activation=self._hidden_activation, | |
final_activation=self._final_activation) | |
return output, output | |
class RNNCell(tf.contrib.rnn.RNNCell): | |
def __init__(self, state_size, reuse=None): | |
self._state_size = state_size | |
self._reuse = reuse | |
@property | |
def state_size(self): | |
return self._state_size | |
@property | |
def output_size(self): | |
return self._state_size | |
def __call__(self, inputs, state, scope=None): | |
with tf.variable_scope(scope or type(self).__name__, reuse=self._reuse): | |
input_size = inputs.shape[1] + self._state_size | |
weights = tf.get_variable('W', [input_size, self._state_size]) | |
bias = tf.get_variable('b', [self._state_size]) | |
concatenated = tf.concat([inputs, state], axis=1) | |
output = tf.matmul(concatenated, weights) + bias | |
return output, output | |
def layer_normalize(tensor, | |
scope=None, | |
epsilon=1e-5, | |
initial_gain=1.0, | |
initial_bias=0.0): | |
assert len(tensor.shape) == 2 | |
input_size = tensor.shape[1] | |
with tf.variable_scope('{0}/layer_norm'.format(scope)): | |
gain_initializer = tf.constant_initializer(initial_gain) | |
gain = tf.get_variable('gain', | |
shape=[input_size], | |
initializer=gain_initializer) | |
bias_initializer = tf.constant_initializer(initial_bias) | |
bias = tf.get_variable('bias', | |
shape=[input_size], | |
initializer=bias_initializer) | |
mean, variance = tf.nn.moments(tensor, axes=[1], keep_dims=True) | |
z_shift = (tensor - mean) / tf.sqrt(variance + epsilon) | |
return gain * z_shift + bias | |
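# layer_normalize standardizes every row of a 2-D tensor and then applies a
# learned per-feature gain and bias. A rough sketch of the same computation for
# a single row x (illustrative only, not part of the training code):
#
#   mean = x.mean()
#   variance = x.var()
#   y = gain * (x - mean) / np.sqrt(variance + epsilon) + bias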
class LSTMCell(tf.contrib.rnn.RNNCell): | |
def __init__(self, | |
state_size, | |
forget_bias=1.0, | |
activation=tf.tanh, | |
final_activation=tf.tanh, | |
layer_normalize=False, | |
reuse=None): | |
self._state_size = state_size | |
self._forget_bias = forget_bias | |
self._activation = activation or (lambda z: z) | |
self._final_activation = final_activation or (lambda z: z) | |
self._reuse = reuse | |
self._layer_normalize = layer_normalize | |
if layer_normalize: | |
print('Using layer normalization') | |
@property | |
def state_size(self): | |
return tf.contrib.rnn.LSTMStateTuple(self._state_size, self._state_size) | |
@property | |
def output_size(self): | |
return self._state_size | |
def perform_layer_normalization(self, values): | |
scopes = ['input_gate', 'output_gate', 'forget_gate', 'input'] | |
return [layer_normalize(i, scopes[n]) for n, i in enumerate(values)] | |
def __call__(self, inputs, state, scope=None): | |
with tf.variable_scope(scope or type(self).__name__, reuse=self._reuse): | |
cell_state, hidden_state = state | |
concatenated = tf.concat([inputs, hidden_state], axis=1) | |
values = dense(concatenated, 4 * self._state_size) | |
values = tf.split(values, 4, axis=1) | |
if self._layer_normalize: | |
values = self.perform_layer_normalization(values) | |
input_gate = tf.sigmoid(values[0], 'i') | |
output_gate = tf.sigmoid(values[1], 'o') | |
forget_gate = tf.sigmoid(values[2] + self._forget_bias, 'f') | |
intermediate_input = self._activation(values[3]) | |
cell_state *= forget_gate | |
cell_state += input_gate * intermediate_input | |
output = output_gate * self._activation(cell_state) | |
new_state = tf.contrib.rnn.LSTMStateTuple(cell_state, output) | |
return output, new_state | |
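# A minimal usage sketch for the cells defined above, assuming the TF 1.x APIs
# already used in this file; the placeholder shape is illustrative, not taken
# from the training code:
#
#   cell = LSTMCell(state_size=32, layer_normalize=True)
#   inputs = tf.placeholder(tf.float32, [None, None, 4])  # batch x time x streams
#   outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
#
# dynamic_rnn calls the cell once per time step and threads the LSTMStateTuple
# (cell state, hidden state) through the sequence.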
def generate_single_epoch(data, sequence_length, number_of_batches): | |
batch_length = len(data) // number_of_batches | |
batches = [] | |
for batch_index in range(number_of_batches): | |
start = batch_length * batch_index | |
end = batch_length * (batch_index + 1) | |
batches.append(data[start:end]) | |
epoch_size = (batch_length - 1) // sequence_length | |
assert epoch_size > 0 | |
batches = np.array(batches, dtype=np.float32) | |
for i in range(epoch_size): | |
x = batches[:, i * sequence_length: (i + 1) * sequence_length] | |
y = batches[:, i * sequence_length + 1: (i + 1) * sequence_length + 1] | |
yield (x, y) | |
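# generate_single_epoch cuts the data into `number_of_batches` contiguous rows
# and yields (x, y) windows where y is x shifted one step ahead. A tiny worked
# example with scalar samples (values illustrative):
#
#   data = list(range(10)), number_of_batches = 2, sequence_length = 2
#   rows: [0, 1, 2, 3, 4] and [5, 6, 7, 8, 9]
#   yield 1: x = [[0, 1], [5, 6]], y = [[1, 2], [6, 7]]
#   yield 2: x = [[2, 3], [7, 8]], y = [[3, 4], [8, 9]]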
def generate_epochs(data, arguments): | |
if arguments.training_points: | |
data = data[:arguments.training_points] | |
for _ in range(arguments.epochs): | |
yield generate_single_epoch(data, | |
arguments.sequence_length, | |
arguments.batch_size) | |
def final_feedforward(y, hyper, number_of_streams): | |
with tf.variable_scope('final_feedforward'): | |
y = tf.reshape(y, [-1, int(y.shape[-1])]) | |
predictions = feedforward(inputs=y, | |
output_size=number_of_streams, | |
number_of_layers=hyper.final_layers, | |
units_per_layer=hyper.final_units, | |
hidden_activation=leaky_relu, | |
initializer=hyper.initializer, | |
regularizer=hyper.regularizer) | |
return predictions | |
def make_rnn(hyper, dropout): | |
status = 'Using {0} cells with a state size of {1}' | |
print(status.format(hyper.model, hyper.state_size)) | |
assert hyper.model in ['lstm', 'rnn', 'gru', 'scrappy'] | |
if hyper.model == 'lstm': | |
# if hyper.layer_normalize: | |
cell = LSTMCell(hyper.state_size, | |
layer_normalize=hyper.layer_normalize) | |
# # else: | |
# if hyper.use_peepholes: | |
# print('Using peephole connections in LSTM cells') | |
# cell = tf.contrib.rnn.LSTMCell(hyper.state_size, | |
# use_peepholes=hyper.use_peepholes, | |
# state_is_tuple=True, | |
# activation=lambda x: x) | |
elif hyper.model == 'rnn': | |
cell = RNNCell(hyper.state_size) | |
# cell = tf.contrib.rnn.BasicRNNCell(hyper.state_size) | |
elif hyper.model == 'gru': | |
cell = tf.contrib.rnn.GRUCell(hyper.state_size) | |
elif hyper.model == 'scrappy': | |
cell = ScrappyCell(hyper.state_size, | |
hyper.hidden_units, | |
hyper.hidden_layers) | |
if hyper.dropout < 1: | |
print('Using dropout with a rate of {0:.3f}'.format(1 - hyper.dropout)) | |
cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=dropout) | |
layers = [cell] * hyper.model_layers | |
rnn = tf.contrib.rnn.MultiRNNCell(layers) | |
rnn = tf.contrib.rnn.DropoutWrapper(rnn, output_keep_prob=dropout) | |
return rnn | |
def configure_learning_rate(hyper, global_step): | |
learning_rate = tf.constant(hyper.learning_rate) | |
if hyper.decay is not None: | |
input_size = hyper.batch_size * hyper.sequence_length | |
decay_steps = math.ceil(hyper.data_points / input_size) | |
status = 'Decaying learning rate from {0} by {1} every {2} steps' | |
print(status.format(hyper.learning_rate, hyper.decay, decay_steps)) | |
learning_rate = tf.train.exponential_decay(learning_rate, | |
decay_steps=decay_steps, | |
decay_rate=hyper.decay, | |
global_step=global_step, | |
staircase=True) | |
tf.summary.scalar('learning_rate', learning_rate) | |
return learning_rate | |
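# With staircase=True, tf.train.exponential_decay multiplies the base rate by
# decay_rate once every decay_steps optimizer steps:
#
#   lr(step) = learning_rate * decay_rate ** (step // decay_steps)
#
# e.g. (illustrative numbers) learning_rate=0.01, decay=0.99, decay_steps=50
# gives 0.01 for steps 0-49, 0.0099 for steps 50-99, and so on.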
def convolve(x, hyper): | |
if not hyper.convolution_filters: | |
return x | |
message = ('Convolving streams with filter size = {0}, ' | |
'stride = {1}, filters = {2}, padding = {3}') | |
print(message.format(hyper.convolution_size, | |
hyper.convolution_stride, | |
hyper.convolution_filters, | |
hyper.convolution_padding)) | |
with tf.variable_scope('convolution'): | |
shape = (hyper.convolution_size, # The width of the filter | |
int(x.shape[-1]), # The number of input channels to convolve | |
hyper.convolution_filters) # The number of output channels | |
filters = tf.get_variable('filters', | |
initializer=hyper.initializer, | |
regularizer=hyper.regularizer, | |
shape=shape) | |
biases = tf.get_variable('biases', | |
shape=[hyper.convolution_filters], | |
initializer=tf.constant_initializer(0.1)) | |
tf.summary.histogram('filters', filters) | |
tf.summary.histogram('biases', biases) | |
convolved = tf.nn.conv1d(value=x, | |
filters=filters, | |
stride=hyper.convolution_stride, | |
padding=hyper.convolution_padding.upper()) | |
result = tf.nn.relu(convolved + biases) | |
assert int(result.shape[-1]) == hyper.convolution_filters | |
return result | |
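# The 1-D convolution above slides `convolution_filters` filters of width
# `convolution_size` along the time axis of x (batch x time x streams). The
# resulting shapes, sketched with illustrative numbers:
#
#   x: [8, 100, 3], convolution_size=3, stride=1, filters=16
#   padding='SAME'  -> [8, 100, 16]  (for stride 1, the time length is preserved)
#   padding='VALID' -> [8, 98, 16]   (for stride 1, it shrinks by convolution_size - 1)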
def build_graph(hyper, number_of_streams): | |
start = time.time() | |
tf.reset_default_graph() | |
x_input = tf.placeholder(tf.float32, [None, None, number_of_streams]) | |
y_hat = tf.placeholder(tf.float32, [None, None, number_of_streams]) | |
x = convolve(x_input, hyper) | |
# offset = x[0, 0] | |
# x %= 1 | |
dropout = tf.Variable(hyper.dropout, trainable=False) | |
rnn = make_rnn(hyper, dropout) | |
batch_size = tf.shape(x_input)[0] | |
initial_state = rnn.zero_state(batch_size, dtype=tf.float32) | |
y, final_state = tf.nn.dynamic_rnn(rnn, x, initial_state=initial_state) | |
predictions = final_feedforward(y, hyper, number_of_streams) | |
# sequence_length = tf.shape(x_input)[1] | |
# unmod = offset + (tf.slice(np.arange(hyper.sequence_length, dtype=np.float32), [0], [sequence_length]) / 100) | |
# predictions += tf.reshape(unmod, tf.shape(predictions)) | |
labels = tf.reshape(y_hat, [-1, number_of_streams]) | |
positives = tf.abs(predictions - labels) < hyper.accuracy_epsilon | |
accuracy = tf.reduce_mean(tf.cast(positives, dtype=tf.float32)) | |
total_loss = tf.reduce_mean(tf.square(predictions - labels)) | |
tf.summary.scalar('accuracy', accuracy) | |
tf.summary.scalar('loss', total_loss) | |
global_step = tf.Variable(0, trainable=False) | |
learning_rate = configure_learning_rate(hyper, global_step) | |
optimizer = tf.train.AdamOptimizer(learning_rate) | |
train = optimizer.minimize(total_loss, global_step=global_step) | |
summaries = tf.summary.merge_all() | |
print('Compiled graph in: {0:.3f}s'.format(time.time() - start)) | |
return Graph(x_input, | |
y_hat, | |
initial_state, | |
final_state, | |
total_loss, | |
predictions, | |
accuracy, | |
summaries, | |
learning_rate, | |
dropout, | |
train) | |
def prediction_file(arguments, index): | |
if arguments.dry: | |
return open('/tmp/ignore', 'w') | |
file_name = 'epoch-{0}'.format(index) | |
path = os.path.join(arguments.workspace, 'predictions', file_name) | |
return open(path, 'w') | |
def plot_predictions(arguments, predictions, data): | |
if not arguments.plot and not arguments.plot_all: | |
return | |
real = list(zip(*data.samples)) | |
matrix = zip(list(zip(*predictions)), real, data.names) | |
predict_start = arguments.prediction_start + arguments.predict_ahead | |
predict_end = predict_start + len(predictions) | |
predict_range = list(range(predict_start, predict_end)) | |
for index, (prediction, real, name) in enumerate(matrix, start=1): | |
print("Plotting result for '{0}'...".format(name)) | |
handles = [] | |
plot.figure(index) | |
plot.title(arguments.workspace) | |
plot.subplot(111) | |
label = '{0}-predicted'.format(name) | |
handles += plot.plot(predict_range, prediction, 'r--', label=label) | |
plot.subplot(111) | |
handles += plot.plot(real, 'g-', label='{0}-real'.format(name)) | |
if arguments.training_points: | |
plot.axvline(arguments.training_points, linestyle='-.') | |
if arguments.prediction_start: | |
plot.axvline(arguments.prediction_start, linestyle=':') | |
plot.legend(loc='upper center', | |
handles=handles, | |
bbox_to_anchor=(0.95, 1), | |
fontsize='large') | |
plot.show() | |
def process_prediction(prediction, data, index, mean_error, output): | |
data = data.samples | |
zero = np.zeros_like(data[0]) | |
error = np.abs(prediction - data[index]) if index < len(data) else zero | |
mean_error += (error - mean_error) / index | |
status = '{0} {1} {2} {3}\n' | |
label = data[index] if index < len(data) else zero | |
error_string = ', '.join(map(str, mean_error)) | |
prediction = ', '.join(map(str, prediction)) | |
label = ', '.join(map(str, label)) | |
status = status.format(index, prediction, label, error_string) | |
if output.name != '/tmp/ignore': | |
output.write(status) | |
return mean_error | |
def train_online(graph, experience): | |
print('Training online (batch of {0}) ...'.format(len(experience))) | |
session = tf.get_default_session() | |
x, y = experience[:-1], experience[1:] | |
feed_dict = {graph.x: [x], graph.y: [y]} | |
session.run(graph.train, feed_dict) | |
experience.clear() | |
def make_example(arguments, data, sequence_length): | |
shape = (arguments.batch_size, sequence_length, data.number_of_streams) | |
return np.zeros(shape=shape, dtype=np.float32) | |
def roll_forward(graph, arguments, data, offset): | |
# print('\nRolling {0} steps forward'.format(arguments.predict_ahead - 1)) | |
example = make_example(arguments, data, arguments.predict_ahead - 1) | |
example[0] = data.samples[offset:offset + arguments.predict_ahead - 1] | |
session = tf.get_default_session() | |
feed_dict = {graph.x: example, graph.dropout: 1.0} | |
fetches = [graph.predictions, graph.final_state] | |
prediction, state = session.run(fetches, feed_dict) | |
return state, [prediction[-1:]] | |
def generate_samples(graph, arguments, data, epoch_index): | |
offset = arguments.prediction_start | |
session = tf.get_default_session() | |
state = None | |
if arguments.predict_ahead > 0: | |
state, generated = roll_forward(graph, arguments, data, offset) | |
else: | |
generated = [data.samples[offset]] | |
example = make_example(arguments, data, 1) | |
error = 0 | |
experience = [] | |
with prediction_file(arguments, epoch_index) as output: | |
for index in range(1 + offset, 1 + offset + arguments.predict): | |
if arguments.train_online and index < len(data.samples): | |
experience.append(data.samples[index]) | |
if len(experience) == arguments.online_batch_size: | |
train_online(graph, experience) | |
index += arguments.predict_ahead | |
if arguments.help_prediction and index < len(data.samples): | |
example[0, 0] = data.samples[index] | |
else: | |
example[0, 0] = generated[-1] | |
feed_dict = {graph.x: example, graph.dropout: 1.0} | |
if state is not None: | |
feed_dict[graph.initial_state] = state | |
fetches = [graph.predictions, graph.final_state] | |
prediction, state = session.run(fetches, feed_dict) | |
prediction = prediction[0] | |
assert len(prediction) == data.number_of_streams | |
generated.append(prediction) | |
error = process_prediction(prediction, data, index, error, output) | |
error_string = ', '.join( | |
['{0} = {1:.5f}'.format(k, v) for k, v in zip(data.names, error)]) | |
print(' | Prediction Error: {0}'.format(error_string)) | |
if epoch_index == 'final': | |
plot_predictions(arguments, generated, data) | |
return error | |
def training_step(graph, x, y, state, write_summary): | |
session = tf.get_default_session() | |
feed_dict = {graph.x: x, graph.y: y} | |
if state is not None: | |
feed_dict[graph.initial_state] = state | |
fetches = [graph.loss, | |
graph.accuracy, | |
graph.final_state, | |
graph.summaries, | |
graph.learning_rate, | |
graph.train] | |
values = session.run(fetches, feed_dict) | |
loss, accuracy, state, summaries, learning_rate = values[:-1] | |
write_summary(summaries) | |
return loss, accuracy, state, learning_rate | |
def make_summary_writer(arguments, session): | |
if not arguments.tensorboard or arguments.dry: | |
return lambda _: _ | |
path = os.path.join(arguments.workspace, 'tensorboard') | |
summary_writer = tf.summary.FileWriter(path, graph=session.graph) | |
global_step = 0 | |
print('Writing TensorBoard logs to {0}'.format(path)) | |
def write_summary(summary): | |
nonlocal global_step | |
summary_writer.add_summary(summary, global_step=global_step) | |
global_step += 1 | |
return write_summary | |
def print_status(variables): | |
status = ('\r Epoch: {:<4} | Iteration: {:<4} | ' | |
'Loss: {:.10f} | Accuracy: {:.8f} | ' | |
'Learning Rate: {:.8f}') | |
status = status.format(variables['epoch_index'], | |
variables['iteration'], | |
variables['loss'], | |
variables['accuracy'], | |
variables['learning_rate']) | |
print(status, end='') | |
def _train_graph(data, graph, arguments, save): | |
losses, accuracies, errors = [], [], [] | |
with tf.Session() as session: | |
tf.global_variables_initializer().run() | |
write_summary = make_summary_writer(arguments, session) | |
epoch_generator = generate_epochs(data.samples, arguments) | |
for epoch_index, epoch in enumerate(epoch_generator, start=1): | |
do_checkpoint = arguments.checkpoint and epoch_index > 1 | |
if do_checkpoint and epoch_index % arguments.checkpoint == 0: | |
error = generate_samples(graph, arguments, data, epoch_index) | |
errors.append(error) | |
stop = False | |
state = None | |
for iteration, (x, y) in enumerate(epoch, start=1): | |
try: | |
values = training_step(graph, x, y, state, write_summary) | |
loss, accuracy, state, learning_rate = values | |
print_status(locals()) | |
losses.append(loss) | |
accuracies.append(accuracy) | |
except KeyboardInterrupt: | |
if stop: break | |
stop = True | |
save(session) | |
if stop: break | |
error = generate_samples(graph, arguments, data, 'final') | |
errors.append(error) | |
return losses, accuracies, errors | |
def make_checkpoint(arguments): | |
file_name = 'checkpoint-{0}'.format(time.strftime('%H-%M-%S')) | |
checkpoint = os.path.join(arguments.workspace, 'checkpoints', file_name) | |
print("Checkpoint is: '{0}'".format(checkpoint)) | |
return checkpoint | |
def train_graph(graph, data, arguments): | |
start = time.time() | |
saver = tf.train.Saver() | |
checkpoint = make_checkpoint(arguments) | |
if arguments.dry: | |
save = lambda _: _ | |
else: | |
save = lambda session: saver.save(session, checkpoint) | |
metrics = _train_graph(data, graph, arguments, save) | |
print('\nTraining time: {0:.3f} s'.format(time.time() - start)) | |
return metrics | |
def parse_arguments(command_line_arguments): | |
parser = argparse.ArgumentParser(description='RNN benchmark tool') | |
parser.add_argument('-e', | |
'--epochs', | |
type=int, | |
default=1000, | |
help='Number of epochs to run') | |
parser.add_argument('-l', | |
'--sequence-length', | |
type=int, | |
default=100, | |
help='Length of the sequence in each batch') | |
parser.add_argument('-b', | |
'--batch-size', | |
type=int, | |
default=1, | |
help='The batch size to vectorize training') | |
parser.add_argument('-s', | |
'--state-size', | |
type=int, | |
default=128, | |
help='The dimension of the state') | |
parser.add_argument('-o', | |
'--model-layers', | |
type=int, | |
default=1, | |
help='The number of (overall) layers of the model') | |
parser.add_argument('--hidden-layers', | |
type=int, | |
default=1, | |
help='The number of hidden layers inside a ' | |
'(deep) RNN or scrappy cell') | |
parser.add_argument('--hidden-units', | |
type=int, | |
default=128, | |
help='The number of hidden units inside a ' | |
'(deep) RNN or scrappy cell') | |
parser.add_argument('--final-layers', | |
type=int, | |
default=1, | |
                        help='The number of hidden layers used to process '
                             'the output of the model')
parser.add_argument('--final-units', | |
type=int, | |
default=128, | |
help='The number of hidden units used to process the ' | |
'output of the model, in each layer') | |
parser.add_argument('--training-points', | |
type=int, | |
default=0, | |
                        help='The number of data points to train on (0 = use all)')
parser.add_argument('-r', | |
'--learning-rate', | |
type=float, | |
default=1e-4, | |
help='The learning rate to use') | |
parser.add_argument('--decay', | |
type=float, | |
help='The learning rate decay rate per epoch') | |
parser.add_argument('--accuracy-epsilon', | |
type=float, | |
default=0.01, | |
                        help='The tolerance within which a prediction '
                             'counts as a true positive')
parser.add_argument('-c', | |
'--checkpoint', | |
type=int, | |
default=100, | |
                        help='How many epochs between sample generations. '
                             'Set to 0 to only generate samples at the end.')
parser.add_argument('--workspace', | |
default=None, | |
                        help='The name of the workspace for the run. '
'If not supplied, an appropriate one will ' | |
'be generated') | |
parser.add_argument('-d', | |
'--data-points', | |
type=int, | |
help='How much data to generate as input') | |
parser.add_argument('--prediction-start', | |
type=int, | |
default=0, | |
help='At what point in the data to start predicting') | |
parser.add_argument('-p', | |
'--predict', | |
type=int, | |
default=10000, | |
help='How many data points to predict ' | |
'on every checkpoint') | |
parser.add_argument('--dropout', | |
type=float, | |
default=0.0, | |
help='The dropout probability') | |
parser.add_argument('--layer-normalize', | |
action='store_true', | |
help='Whether to use layer normalization') | |
parser.add_argument('--initializer', | |
choices=INITIALIZERS.keys(), | |
                        default='orthogonal',
help='The initializer to use for weights') | |
parser.add_argument('--regularizer', | |
choices=REGULARIZERS.keys(), | |
help='The regularizer to use for weights') | |
parser.add_argument('--use-peepholes', | |
action='store_true', | |
help='Whether to use peepholes') | |
parser.add_argument('--convolution-filters', | |
type=int, | |
help='If set, will apply 1-D convolutions to the ' | |
'signal with the given number of filters.') | |
parser.add_argument('--convolution-size', | |
type=int, | |
default=3, | |
help='If --convolution-filters is set, uses this ' | |
'filter size for the convolution') | |
parser.add_argument('--convolution-stride', | |
type=int, | |
default=1, | |
help='If --convolution-filters is set, uses this ' | |
'stride for the convolution') | |
parser.add_argument('--convolution-padding', | |
type=str, | |
choices=('valid', 'same'), | |
default='same', | |
help='If --convolution-filters is set, uses this ' | |
'padding technique for the convolution') | |
parser.add_argument('--predict-ahead', | |
type=int, | |
default=0, | |
help='The offset into the future for prediction') | |
parser.add_argument('--train-online', | |
action='store_true', | |
help='Whether to do online training during prediction') | |
parser.add_argument('--online-batch-size', | |
type=int, | |
help='How many samples to collect online before ' | |
'performing a single training step') | |
parser.add_argument('--z-shift', | |
action='store_true', | |
help='Whether to z-shift the data') | |
parser.add_argument('--help-prediction', | |
action='store_true', | |
help='Whether to feed in ground truth ' | |
'instead of predictions during prediction') | |
parser.add_argument('--plot', | |
action='store_true', | |
                        help='Whether to plot the predictions')
parser.add_argument('--plot-all', | |
action='store_true', | |
                        help='Whether to plot everything')
parser.add_argument('--tensorboard', | |
action='store_true', | |
                        help='Whether to enable TensorBoard')
parser.add_argument('--dry', | |
action='store_true', | |
help='Dry run (does not store anything to disk)') | |
parser.add_argument('--from-files', | |
action='store_true', | |
help='Assume the streams are files to load, not ' | |
'names of streams to generate internally') | |
parser.add_argument('model', | |
choices=['lstm', 'gru', 'rnn', 'scrappy'], | |
help='The kind of model to run') | |
parser.add_argument('streams', | |
nargs='+', | |
help='The kind of streams to generate') | |
arguments = parser.parse_args(command_line_arguments) | |
if not arguments.from_files: | |
if not all(s in DATA_STREAMS.keys() for s in arguments.streams): | |
streams = ', '.join(DATA_STREAMS.keys()) | |
raise KeyError('Streams must be one of {{{}}}'.format(streams)) | |
if arguments.workspace is None: | |
parts = [arguments.model, | |
'-'.join(arguments.streams), | |
arguments.model_layers, | |
arguments.state_size, | |
arguments.hidden_layers, | |
arguments.hidden_units, | |
arguments.final_layers, | |
arguments.final_units, | |
datetime.datetime.now().isoformat()] | |
arguments.workspace = '-'.join(map(str, parts)) | |
arguments.dropout = 1.0 - arguments.dropout | |
if arguments.initializer: | |
print('Using {0} initialization'.format(arguments.initializer)) | |
arguments.initializer = INITIALIZERS.get(arguments.initializer) | |
if arguments.regularizer: | |
print('Using {0} regularization'.format(arguments.regularizer)) | |
arguments.regularizer = REGULARIZERS[arguments.regularizer] | |
if arguments.train_online and not arguments.online_batch_size: | |
message = 'Setting online batch size to sequence length ({0})' | |
print(message.format(arguments.sequence_length)) | |
arguments.online_batch_size = arguments.sequence_length | |
return arguments | |
def generate_data(streams, data_points): | |
assert data_points is not None | |
message = "Generating {0} data points each with streams: {1}" | |
print(message.format(data_points, ', '.join(streams))) | |
data = [] | |
for t in range(data_points): | |
data.append([DATA_STREAMS[stream](t) for stream in streams]) | |
return data | |
def read_streams_from_disk(streams, data_points): | |
data = [] | |
for stream in streams: | |
with open(stream, 'r') as source: | |
stream_data = [] | |
for count, sample in enumerate(source): | |
if count == data_points: | |
break | |
if sample and not sample.isspace(): | |
stream_data.append(float(sample)) | |
message = "Loaded {0} data points from file '{1}'" | |
print(message.format(len(stream_data), stream)) | |
data.append(stream_data) | |
return list(zip(*data)) | |
def z_shift(data): | |
data = np.array(data) | |
mean = np.mean(data, axis=0) | |
print('Data has mean {0}'.format(', '.join(map(str, mean)))) | |
variance = np.mean(np.square(data), axis=0) - np.square(mean) | |
stddev = np.sqrt(variance) | |
print('Data has stddev {0}'.format(', '.join(map(str, stddev)))) | |
z = (data - mean) / stddev | |
    assert np.all(np.abs(np.mean(z, axis=0)) < 1e-9)
return z.tolist() | |
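# z_shift standardizes each stream independently, z = (x - mean) / stddev, so
# every column of the result has (approximately) zero mean and unit variance.
# A quick sanity check one could run (illustrative only):
#
#   z = np.array(z_shift([[1.0], [2.0], [3.0]]))
#   np.allclose(z.mean(axis=0), 0.0)  # True
#   np.allclose(z.std(axis=0), 1.0)   # True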
def load_data(arguments): | |
streams, data_points = arguments.streams, arguments.data_points | |
if arguments.from_files: | |
data = read_streams_from_disk(streams, data_points) | |
else: | |
data = generate_data(streams, data_points) | |
if arguments.z_shift: | |
print('Z-shifting data') | |
data = z_shift(data) | |
if data_points is None: | |
arguments.data_points = len(data) | |
return Data(data, number_of_streams=len(streams), names=streams) | |
def setup_workspace(arguments): | |
if arguments.dry: | |
return | |
inner_most = os.path.join(arguments.workspace, 'checkpoints') | |
os.makedirs(inner_most, exist_ok=True) | |
inner_most = os.path.join(arguments.workspace, 'predictions') | |
os.makedirs(inner_most, exist_ok=True) | |
print('Created directory {0}'.format(arguments.workspace)) | |
def write_metrics(arguments, loss, accuracy): | |
if arguments.dry: | |
return | |
path = os.path.join(arguments.workspace, 'metrics.csv') | |
print('Writing metrics to {0} ...'.format(path)) | |
with open(path, 'w') as output: | |
for l, a in zip(loss, accuracy): | |
output.write('{0} {1}\n'.format(l, a)) | |
def save_results(arguments, metrics): | |
loss, accuracy, error = metrics | |
if not arguments.dry: | |
write_metrics(arguments, loss, accuracy) | |
if not arguments.plot_all: | |
return | |
plot.figure(1) | |
plot.subplot(111) | |
loss_plot, = plot.plot(loss, 'r-', label='loss') | |
plot.subplot(111) | |
accuracy_plot, = plot.plot(accuracy, 'b-', label='accuracy') | |
plot.title(arguments.workspace + ' (loss/accuracy)') | |
plot.legend(loc='upper center', | |
handles=[loss_plot, accuracy_plot], | |
bbox_to_anchor=(0.95, 1), | |
fontsize='x-large') | |
plot.figure(2) | |
plot.title('Mean error over time') | |
plot.plot(error, 'r-') | |
plot.show() | |
def main(): | |
''' | |
python rnn_tester.py --learning-rate=0.01 --decay=0.99 --epochs=1000 --data-points=5000 --predict=10000 --batch-size=8 --model-layers=3 --state-size=32 --final-layers=2 --final-units=128 --plot --dry --from-files lstm sine.csv | |
''' | |
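    # An example run with internally generated streams instead of files
    # (all flags and stream names are defined above; numbers are illustrative):
    #
    #   python rnn_tester.py --epochs=200 --data-points=5000 --predict=2000 \
    #       --state-size=64 --plot --dry lstm sine cosine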
arguments = parse_arguments(sys.argv[1:]) | |
print(arguments) | |
setup_workspace(arguments) | |
data = load_data(arguments) | |
graph = build_graph(arguments, data.number_of_streams) | |
metrics = train_graph(graph, data, arguments) | |
save_results(arguments, metrics) | |
if __name__ == '__main__': | |
main() |