import logging

import numpy as np
import tensorflow as tf
from tensorflow.contrib import layers


GO_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2

def seq2seq(mode, features, labels, params):
    vocab_size = params['vocab_size']
    embed_dim = params['embed_dim']
    num_units = params['num_units']
    input_max_length = params['input_max_length']
    output_max_length = params['output_max_length']

    inp = features['input']
    output = features['output']
    batch_size = tf.shape(inp)[0]
    start_tokens = tf.zeros([batch_size], dtype=tf.int64)
    # Prepend the GO token to the targets to build the decoder inputs.
    train_output = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
    # Sequence lengths: count everything that is not the END token (id 1).
    input_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(inp, 1)), 1)
    output_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(train_output, 1)), 1)
    # Encoder and decoder share one embedding matrix under the 'embed' scope.
    input_embed = layers.embed_sequence(
        inp, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed')
    output_embed = layers.embed_sequence(
        train_output, vocab_size=vocab_size, embed_dim=embed_dim, scope='embed', reuse=True)
    with tf.variable_scope('embed', reuse=True):
        embeddings = tf.get_variable('embeddings')

    # Encoder: a single GRU over the embedded inputs.
    cell = tf.contrib.rnn.GRUCell(num_units=num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        cell, input_embed, dtype=tf.float32)

    # Training uses teacher forcing; prediction feeds back greedy samples.
    train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed, output_lengths)
    # train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
    #     output_embed, output_lengths, embeddings, 0.3
    # )
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)

    def decode(helper, scope, reuse=None):
        """Builds the attention decoder; variables are shared between train and predict."""
        with tf.variable_scope(scope, reuse=reuse):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=num_units, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
            cell = tf.contrib.rnn.GRUCell(num_units=num_units)
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell, attention_mechanism,
                attention_layer_size=num_units // 2)
            out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                attn_cell, vocab_size, reuse=reuse
            )
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=out_cell, helper=helper,
                initial_state=out_cell.zero_state(
                    dtype=tf.float32, batch_size=batch_size))
                # initial_state=encoder_final_state)
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder, output_time_major=False,
                impute_finished=True, maximum_iterations=output_max_length
            )
            return outputs[0]

    train_outputs = decode(train_helper, 'decode')
    pred_outputs = decode(pred_helper, 'decode', reuse=True)

    tf.identity(train_outputs.sample_id[0], name='train_pred')
    # Zero out the loss at steps where the decoder input is already the END token.
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    loss = tf.contrib.seq2seq.sequence_loss(
        train_outputs.rnn_output, output, weights=weights)
    train_op = layers.optimize_loss(
        loss, tf.train.get_global_step(),
        optimizer=params.get('optimizer', 'Adam'),
        learning_rate=params.get('learning_rate', 0.001),
        summaries=['loss', 'learning_rate'])

    tf.identity(pred_outputs.sample_id[0], name='predictions')
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_outputs.sample_id,
        loss=loss,
        train_op=train_op
    )

def tokenize_and_map(line, vocab):
    # Strip the trailing newline before splitting so the last token maps correctly.
    return [vocab.get(token, UNK_TOKEN) for token in line.strip().split(' ')]

def make_input_fn(
        batch_size, input_filename, output_filename, vocab,
        input_max_length, output_max_length,
        input_process=tokenize_and_map, output_process=tokenize_and_map):

    def input_fn():
        inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
        output = tf.placeholder(tf.int64, shape=[None, None], name='output')
        tf.identity(inp[0], 'input_0')
        tf.identity(output[0], 'output_0')
        return {
            'input': inp,
            'output': output,
        }, None

    def sampler():
        # Cycle over the parallel input/output files forever, one example per line.
        while True:
            with open(input_filename) as finput:
                with open(output_filename) as foutput:
                    for in_line in finput:
                        out_line = foutput.readline()
                        yield {
                            'input': input_process(in_line, vocab)[:input_max_length - 1] + [END_TOKEN],
                            'output': output_process(out_line, vocab)[:output_max_length - 1] + [END_TOKEN]
                        }

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for i in range(batch_size):
            rec = next(sample_me)
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            input_length = max(input_length, len(inputs[-1]))
            output_length = max(output_length, len(outputs[-1]))
        # Pad me right with </S> token.
        for i in range(batch_size):
            inputs[i] += [END_TOKEN] * (input_length - len(inputs[i]))
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        return {
            'input:0': inputs,
            'output:0': outputs
        }

    return input_fn, feed_fn

def load_vocab(filename):
    vocab = {}
    with open(filename) as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx
    return vocab

def get_rev_vocab(vocab):
    return {idx: key for key, idx in vocab.items()}

def get_formatter(keys, vocab):
    rev_vocab = get_rev_vocab(vocab)

    def to_str(sequence):
        tokens = [
            rev_vocab.get(x, "<UNK>") for x in sequence]
        return ' '.join(tokens)

    def format(values):
        res = []
        for key in keys:
            res.append("%s = %s" % (key, to_str(values[key])))
        return '\n'.join(res)

    return format

def train_seq2seq(
        input_filename, output_filename, vocab_filename,
        model_dir):
    vocab = load_vocab(vocab_filename)
    params = {
        'vocab_size': len(vocab),
        'batch_size': 32,
        'input_max_length': 30,
        'output_max_length': 30,
        'embed_dim': 100,
        'num_units': 256
    }
    est = tf.estimator.Estimator(
        model_fn=seq2seq,
        model_dir=model_dir, params=params)

    input_fn, feed_fn = make_input_fn(
        params['batch_size'],
        input_filename,
        output_filename,
        vocab, params['input_max_length'], params['output_max_length'])

    # Make hooks to print examples of inputs/predictions.
    print_inputs = tf.train.LoggingTensorHook(
        ['input_0', 'output_0'], every_n_iter=100,
        formatter=get_formatter(['input_0', 'output_0'], vocab))
    print_predictions = tf.train.LoggingTensorHook(
        ['predictions', 'train_pred'], every_n_iter=100,
        formatter=get_formatter(['predictions', 'train_pred'], vocab))

    est.train(
        input_fn=input_fn,
        hooks=[tf.train.FeedFnHook(feed_fn), print_inputs, print_predictions],
        steps=10000)

def main():
    tf.logging._logger.setLevel(logging.INFO)
    train_seq2seq('input', 'output', 'vocab', 'model/seq2seq')


if __name__ == '__main__':
    main()
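For reference, a toy sketch of the file formats implied by load_vocab and sampler above (the contents are hypothetical; the real data generator lives in the repo linked below): the vocab file has one token per line, with line index as token id, so the first three lines should be the special tokens; input and output are parallel files of space-separated, already-tokenized examples, one per line.

    # Toy data files in the layout expected by load_vocab() and sampler() above.
    # (Hypothetical content, for illustration only.)
    with open('vocab', 'w') as f:
        f.write('\n'.join(['<S>', '</S>', '<UNK>', 'how', 'are', 'you', 'i', 'am', 'fine']))
    with open('input', 'w') as f:
        f.write('how are you\n')
    with open('output', 'w') as f:
        f.write('i am fine\n')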
Since you use BahdanauAttention in your code, why didn't you set output_attention (in AttentionWrapper) to False?
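For readers wondering what that flag changes, here is a minimal sketch of the variant being asked about, assuming the same cell, attention_mechanism, and num_units as in decode() above (untested against the gist's results): per the TF 1.x documentation, output_attention=True corresponds to Luong-style attention, while False emits the raw cell output, which matches Bahdanau-style attention; the gist keeps the default True.

    # Hypothetical variant of the AttentionWrapper call inside decode():
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(
        cell, attention_mechanism,
        attention_layer_size=num_units // 2,
        output_attention=False)  # emit the cell output instead of the attention vector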
Following this example, wouldn't the first END_TOKEN target symbol be given zero weight?
@robmsylvester I think this is correct, as it does not matter to the model whether feeding the END_TOKEN to the decoder gives the correct next word or not.
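For concreteness, a minimal sketch of how the loss weights behave (token ids as in the gist, END_TOKEN = 1; the example sequence is hypothetical). The mask is built from the decoder inputs, so a step is zeroed out only once END_TOKEN is being fed in:

    import tensorflow as tf

    # Hypothetical target "w1 w2 </S> </S>" (ids [5, 7, 1, 1]) with GO (0) prepended,
    # mirroring train_output in the gist.
    train_output = tf.constant([[0, 5, 7, 1, 1]], dtype=tf.int64)
    weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
    with tf.Session() as sess:
        print(sess.run(weights))  # [[1. 1. 1. 0.]] -- zero weight begins where END is the input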
I want to load pre-trained embeddings. Should I change layers.embed_sequence to tf.nn.embedding_lookup?
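One possible approach (a sketch, not from the gist): create the 'embeddings' variable yourself under the 'embed' scope, initialize it from the pre-trained matrix, and use tf.nn.embedding_lookup for both encoder and decoder inputs. Here pretrained_matrix is a hypothetical NumPy array of shape [vocab_size, embed_dim]; vocab_size, embed_dim, inp, and train_output are the names from the gist.

    # Sketch only: replaces the two layers.embed_sequence calls in seq2seq().
    with tf.variable_scope('embed'):
        embeddings = tf.get_variable(
            'embeddings', shape=[vocab_size, embed_dim],
            initializer=tf.constant_initializer(pretrained_matrix),
            trainable=False)  # set True to fine-tune the vectors
    input_embed = tf.nn.embedding_lookup(embeddings, inp)
    output_embed = tf.nn.embedding_lookup(embeddings, train_output)

With this, the later tf.get_variable('embeddings') lookup under reuse=True is no longer needed, since embeddings is already available for GreedyEmbeddingHelper.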
Can someone tell me what attention_layer_size is, and why it is set to num_units / 2?
Can anyone guide me on how to use BahdanauAttention? Whether I use it as "tf.contrib.seq2seq.BahdanauAttention" or "seq2seq.BahdanauAttention", I get the error "AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'BahdanauAttention'". I'm using TensorFlow 1.0.1. I'm new to deep learning. Any suggestions would be appreciated.
Thank you.
Data generator and full binary here: https://github.com/ilblackdragon/tf_examples/blob/master/seq2seq/