Quora question pairs - decomposable NLI
import tensorflow as tf

# Decomposable attention model for NLI
# https://arxiv.org/pdf/1606.01933v1.pdf
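# Reading aid (not in the original gist): the graph below follows the paper's
# attend -> compare -> aggregate structure. In the paper's notation, a_i and b_j
# are the embedded tokens of the two sentences and F, G are small feed-forward nets:
#   e_ij    = F(a_i) . F(b_j)                      unnormalized alignment scores
#   beta_i  = sum_j softmax_j(e_i:) * b_j          sentence-2 phrase aligned to a_i
#   alpha_j = sum_i softmax_i(e_:j) * a_i          sentence-1 phrase aligned to b_j
#   v1_i = G([a_i; beta_i]),  v2_j = G([b_j; alpha_j])
#   v = [sum_i v1_i ; sum_j v2_j]  ->  final classifier
# Here F and G are single ReLU dense layers with weights shared across the two sentences.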
# DICTIONARY_SIZE (vocabulary size) and EMBEDDING_SIZE (embedding width) are
# assumed to be defined earlier in the notebook.
dnli_graph = tf.Graph()
with dnli_graph.as_default():
    # Pre-trained word embeddings, loaded through a placeholder and kept fixed.
    embedding_matrix = tf.Variable(tf.zeros([DICTIONARY_SIZE, EMBEDDING_SIZE]), name='word_embeddings', trainable=False)
    embedding_placeholder = tf.placeholder(tf.float32, [DICTIONARY_SIZE, EMBEDDING_SIZE])
    embedding_init_op = embedding_matrix.assign(embedding_placeholder)

    # Padded token-id matrices for both questions and one-hot labels.
    X_Q1 = tf.placeholder(tf.int32, [None, None])
    X_Q2 = tf.placeholder(tf.int32, [None, None])
    y_ = tf.placeholder(tf.int32, [None, 2])
    is_training = tf.placeholder(tf.bool)

    with tf.name_scope('embeddings_lookup'):
        Q1_embeddings = tf.nn.embedding_lookup(embedding_matrix, X_Q1)
        Q2_embeddings = tf.nn.embedding_lookup(embedding_matrix, X_Q2)

    # Attend: project both sentences with a shared dense layer and compute
    # soft alignments between them.
    with tf.name_scope('attention'):
        e_Q1 = tf.layers.dense(Q1_embeddings, EMBEDDING_SIZE, activation=tf.nn.relu, name='embedding_projection_nn')
        e_Q2 = tf.layers.dense(Q2_embeddings, EMBEDDING_SIZE, activation=tf.nn.relu, name='embedding_projection_nn', reuse=True)

        e = tf.matmul(e_Q1, tf.transpose(e_Q2, [0, 2, 1]))
        beta = tf.matmul(tf.nn.softmax(e), Q2_embeddings)
        alpha = tf.matmul(tf.nn.softmax(tf.transpose(e, [0, 2, 1])), Q1_embeddings)

    # Compare: each token is compared with its aligned phrase, then the
    # comparison vectors are summed over time and concatenated.
    with tf.name_scope('comparison'):
        v_Q1 = tf.layers.dense(tf.concat([Q1_embeddings, beta], 2), EMBEDDING_SIZE, activation=tf.nn.relu, name='attention_nn')
        v_Q2 = tf.layers.dense(tf.concat([Q2_embeddings, alpha], 2), EMBEDDING_SIZE, activation=tf.nn.relu, name='attention_nn', reuse=True)

        v = tf.concat([
            tf.reduce_sum(v_Q1, 1),
            tf.reduce_sum(v_Q2, 1)
        ], 1)
        tf.summary.histogram('v_activations', v)

        v_dropout = tf.layers.dropout(v, rate=0.3, training=is_training)

    # Aggregate / classify: a small feed-forward net on top of the pooled vector.
    with tf.name_scope('classification'):
        _L1 = tf.layers.dense(v_dropout, EMBEDDING_SIZE, activation=tf.nn.relu)
        L1 = tf.layers.dropout(_L1, rate=0.3, training=is_training)
        tf.summary.histogram('l1_activations', _L1)

        _L2 = tf.layers.dense(L1, EMBEDDING_SIZE, activation=tf.nn.relu)
        L2 = tf.layers.dropout(_L2, rate=0.2, training=is_training)
        tf.summary.histogram('l2_activations', _L2)

        y = tf.layers.dense(L2, 2, activation=tf.nn.sigmoid, name='classification_nn')

    loss = tf.losses.log_loss(y_, y)
    tf.summary.scalar('loss', loss)

    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    train_op = tf.train.AdamOptimizer().minimize(loss)
    metrics_op = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=50)
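A minimal training sketch for this graph (not part of the original gist): it assumes pretrained_embeddings is a DICTIONARY_SIZE x EMBEDDING_SIZE numpy array and that next_batch() yields padded index matrices q1, q2 together with one-hot labels; NUM_STEPS and the log/checkpoint paths are likewise illustrative.

with tf.Session(graph=dnli_graph) as session:
    session.run(tf.global_variables_initializer())
    # Load the fixed, pre-trained word embeddings once before training.
    session.run(embedding_init_op, feed_dict={embedding_placeholder: pretrained_embeddings})
    writer = tf.summary.FileWriter('./logs/dnli', graph=dnli_graph)

    for step in range(NUM_STEPS):
        q1, q2, labels = next_batch()
        _, summary = session.run(
            [train_op, metrics_op],
            feed_dict={X_Q1: q1, X_Q2: q2, y_: labels, is_training: True}
        )
        writer.add_summary(summary, step)
        if step % 1000 == 0:
            saver.save(session, './checkpoints/dnli', global_step=step)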
# Extended version of the vanilla decomposable NLI model proposed in the paper.
# It adds intra-sentence (self) attention to learn which words are important
# within each sentence, using a dense layer whose weights are shared between
# both sentences. A softmax over the resulting scores yields attention weights,
# which are then applied to the original embedded sentences.
ednli_graph = tf.Graph()
with ednli_graph.as_default():
    embedding_matrix = tf.Variable(tf.zeros([DICTIONARY_SIZE, EMBEDDING_SIZE]), name='word_embeddings', trainable=False)
    embedding_placeholder = tf.placeholder(tf.float32, [DICTIONARY_SIZE, EMBEDDING_SIZE])
    embedding_init_op = embedding_matrix.assign(embedding_placeholder)

    X1 = tf.placeholder(tf.int32, [None, None])
    X2 = tf.placeholder(tf.int32, [None, None])
    y_ = tf.placeholder(tf.int32, [None, 2])
    is_training = tf.placeholder(tf.bool)

    with tf.name_scope('embeddings_lookup'):
        X1_embedded = tf.nn.embedding_lookup(embedding_matrix, X1)
        X2_embedded = tf.nn.embedding_lookup(embedding_matrix, X2)

    # Sentence self attention
    with tf.name_scope('self_attention'):
        X1_self_projection = tf.layers.dense(X1_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_self_attention')
        X2_self_projection = tf.layers.dense(X2_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_self_attention', reuse=True)

        X1_self_e = tf.matmul(X1_self_projection, tf.transpose(X1_self_projection, [0, 2, 1]))
        X2_self_e = tf.matmul(X2_self_projection, tf.transpose(X2_self_projection, [0, 2, 1]))

        X1_gamma = tf.matmul(tf.nn.softmax(X1_self_e), X1_embedded)
        X2_gamma = tf.matmul(tf.nn.softmax(X2_self_e), X2_embedded)

    # Inter-sentence attention
    with tf.name_scope('attention'):
        X1_e = tf.layers.dense(X1_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_attention')
        X2_e = tf.layers.dense(X2_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_attention', reuse=True)

        e = tf.matmul(X1_e, tf.transpose(X2_e, [0, 2, 1]))
        beta = tf.matmul(tf.nn.softmax(e), X2_e)
        alpha = tf.matmul(tf.nn.softmax(tf.transpose(e, [0, 2, 1])), X1_e)

    # Inter-sentence comparison
    with tf.name_scope('comparison'):
        X1_v = tf.layers.dense(tf.concat([X1_embedded, beta, X1_gamma], 2), 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_comparison')
        X2_v = tf.layers.dense(tf.concat([X2_embedded, alpha, X2_gamma], 2), 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_comparison', reuse=True)

    # Sum over the time dimension and concatenate both sentences
    # (BATCH_SIZE x 4*EMBEDDING_SIZE).
    with tf.name_scope('sum_reduction'):
        v = tf.concat([
            tf.reduce_sum(X1_v, 1),
            tf.reduce_sum(X2_v, 1),
        ], 1)
        v_dropout = tf.layers.dropout(v, rate=0.3, training=is_training)

    # Classification
    with tf.name_scope('classification'):
        L1 = tf.layers.dropout(
            tf.layers.dense(v_dropout, 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='L1'),
            rate=0.3, training=is_training
        )
        tf.summary.histogram('l1_activations', L1)

        L2 = tf.layers.dropout(
            tf.layers.dense(L1, 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='L2'),
            rate=0.3, training=is_training
        )
        tf.summary.histogram('l2_activations', L2)

        L3 = tf.layers.dropout(
            tf.layers.dense(L2, EMBEDDING_SIZE, activation=tf.nn.relu, name='L3'),
            rate=0.2, training=is_training
        )
        tf.summary.histogram('l3_activations', L3)

        y = tf.layers.dense(L3, 2, activation=tf.nn.sigmoid, name='y')
        tf.summary.histogram('y_activations', y)

    loss = tf.losses.log_loss(y_, y)
    tf.summary.scalar('loss', loss)

    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    train_op = tf.train.AdamOptimizer().minimize(loss)
    metrics_op = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=50)
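The extended graph is driven the same way as the first one. As a rough sketch (again not part of the gist), restoring the most recent checkpoint and scoring a held-out batch could look like the following; the checkpoint directory and the val_q1, val_q2, val_labels arrays are illustrative names.

with tf.Session(graph=ednli_graph) as session:
    # Restores all saved variables, including the fixed embedding matrix.
    saver.restore(session, tf.train.latest_checkpoint('./checkpoints'))
    val_loss, val_acc = session.run(
        [loss, accuracy],
        feed_dict={X1: val_q1, X2: val_q2, y_: val_labels, is_training: False}
    )
    print('validation loss: %.4f, accuracy: %.4f' % (val_loss, val_acc))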