Textual entailment training using TensorFlow.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
import zipfile

glove_vectors_file = "glove.6B.50d.txt"
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt"

# Map each GloVe token to its 50-dimensional embedding vector.
glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf-8") as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")
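
# Sanity check (illustrative; assumes the standard glove.6B.50d.txt file):
# every entry should be a 50-dimensional vector.
# assert glove_wordmap["the"].shape == (50,)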
def sentence2sequence(sentence):
    # Turn a sentence into a list of GloVe vectors (rows) and the matched
    # words, greedily matching the longest in-vocabulary prefix of each token.
    tokens = sentence.lower().split(" ")
    rows = []
    words = []
    # Greedy search for tokens
    for token in tokens:
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                i = i - 1
    return rows, words
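
# A minimal usage sketch (illustrative): for an in-vocabulary sentence the
# greedy matcher returns one 50-d GloVe vector and one word per token, so
# len(rows) == len(words).
# rows, words = sentence2sequence("two dogs run through a field")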
rnn_size = 64
# Illustrative plain RNN cell; the model below uses LSTM cells instead.
rnn = tf.contrib.rnn.BasicRNNCell(rnn_size)

# Constants setup
max_hypothesis_length, max_evidence_length = 30, 30
batch_size, vector_size, hidden_size = 128, 50, 64
lstm_size = hidden_size
weight_decay = 0.0001
learning_rate = 1
input_p, output_p = 0.5, 0.5
training_iterations_count = 100000
display_step = 10
def score_setup(row):
    # Average the five annotator labels for a row into a probability
    # distribution over the three entailment classes.
    convert_dict = {
        'entailment': 0,
        'neutral': 1,
        'contradiction': 2
    }
    score = np.zeros((3,))
    for x in range(1, 6):
        tag = row["label" + str(x)]
        if tag in convert_dict:
            score[convert_dict[tag]] += 1
    return score / (1.0 * np.sum(score))
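
# Illustrative example: a row tagged 'entailment' by four of five annotators
# and 'neutral' by one yields score_setup(row) == [0.8, 0.2, 0.0].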
def fit_to_size(matrix, shape):
    # Zero-pad (or crop) `matrix` so the result has exactly `shape`.
    # Note: the slices must be a tuple; indexing with a list of slices
    # is an error in recent NumPy versions.
    res = np.zeros(shape)
    slices = tuple(slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape))
    res[slices] = matrix[slices]
    return res
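
# Illustrative: a (3, 50) sentence matrix is zero-padded to (30, 50), while
# a (40, 50) matrix is cropped to its first 30 rows.
# fit_to_size(np.ones((3, 50)), (30, 50)).shape  # -> (30, 50)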
def split_data_into_scores():
    import csv
    with open(snli_dev_file, "r") as data:
        train = csv.DictReader(data, delimiter='\t')
        evi_sentences = []
        hyp_sentences = []
        labels = []
        scores = []
        for row in train:
            hyp_sentences.append(np.vstack(
                sentence2sequence(row["sentence1"].lower())[0]))
            evi_sentences.append(np.vstack(
                sentence2sequence(row["sentence2"].lower())[0]))
            labels.append(row["gold_label"])
            scores.append(score_setup(row))

    hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size))
                              for x in hyp_sentences])
    evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size))
                              for x in evi_sentences])

    return (hyp_sentences, evi_sentences), labels, np.array(scores)

data_feature_list, correct_values, correct_scores = split_data_into_scores()
l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size
l_seq = l_h + l_e

tf.reset_default_graph()

lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)

# N: The number of elements in each of our batches, which we use to train
#    on subsets of the data for efficiency's sake.
# l_h: The maximum length of a hypothesis. Training an RNN is
#    extraordinarily difficult without rolling it out to a fixed length.
# l_e: The maximum length of a piece of evidence, for the same reason.
# D: The size of the GloVe (or other) word vectors.
hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis')
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')
y = tf.placeholder(tf.float32, [N, 3], 'label')
# hyp: Where the hypotheses will be stored during training.
# evi: Where the evidence will be stored during training.
# y: Where the correct scores will be stored during training.

# lstm_size: the size of the gates in the LSTM, as in the first LSTM
#    layer's initialization.
lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# lstm_back: The LSTM used for looking backwards through the sentences,
#    similar to lstm.

# input_p: the probability that inputs to the LSTM will be retained at each
#    iteration of dropout.
# output_p: the probability that outputs from the LSTM will be retained at
#    each iteration of dropout.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)
# lstm_drop_back: A dropout wrapper for lstm_back, like lstm_drop.

fc_initializer = tf.random_normal_initializer(stddev=0.1)
# fc_initializer: initial values for the fully connected layer's weights.

# hidden_size: the size of the outputs from each LSTM layer,
#    multiplied by 2 to account for the two LSTMs.
fc_weight = tf.get_variable('fc_weight', [2 * hidden_size, 3],
                            initializer=fc_initializer)
# fc_weight: Storage for the fully connected layer's weights.
fc_bias = tf.get_variable('bias', [3])
# fc_bias: Storage for the fully connected layer's bias.

# tf.GraphKeys.REGULARIZATION_LOSSES: A key to a collection in the graph
#    designated for losses due to regularization. Here that portion of the
#    loss is L2 regularization on the fully connected layer's weights.
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                     tf.nn.l2_loss(fc_weight))
x = tf.concat([hyp, evi], 1)  # shape: N, (l_h + l_e), D
# Permute batch_size and n_steps
x = tf.transpose(x, [1, 0, 2])  # shape: (l_h + l_e), N, D
# Reshape to (n_steps * batch_size, n_input)
x = tf.reshape(x, [-1, vector_size])  # shape: (l_h + l_e) * N, D
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq)

# x: the inputs to the bidirectional_rnn

# tf.contrib.rnn.static_bidirectional_rnn: Runs the input through two
#    recurrent networks, one that runs the inputs forward and one that runs
#    the inputs in reversed order, combining the outputs.
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back,
                                                            x, dtype=tf.float32)
# rnn_outputs: the list of LSTM outputs. What we want is the latest output,
#    rnn_outputs[-1].
classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias
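
# Shape check (given the constants above): rnn_outputs[-1] concatenates the
# forward and backward outputs at the final time step, shape (N, 2*H);
# multiplying by fc_weight (2*H, 3) and adding fc_bias gives one logit per
# entailment class, shape (N, 3).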
# The scores are relative certainties for how likely the output matches
# a given class of entailment:
#     0: positive entailment
#     1: neutral entailment
#     2: negative entailment (contradiction)

with tf.variable_scope('Accuracy'):
    predicts = tf.cast(tf.argmax(classification_scores, 1), 'int32')
    y_label = tf.cast(tf.argmax(y, 1), 'int32')
    corrects = tf.equal(predicts, y_label)
    num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

with tf.variable_scope("loss"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=classification_scores, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    total_loss = loss + weight_decay * tf.add_n(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
opt_op = optimizer.minimize(total_loss)
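
# total_loss combines the averaged cross-entropy with the L2 penalty on the
# fully connected weights collected above, scaled by weight_decay.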
# Initialize variables
init = tf.global_variables_initializer()

# Use TQDM if installed
tqdm_installed = False
try:
    from tqdm import tqdm
    tqdm_installed = True
except ImportError:
    pass

# Launch the TensorFlow session
sess = tf.Session()
writer = tf.summary.FileWriter('./log', sess.graph)  # write the graph to a local log directory
merge_op = tf.summary.merge_all()  # operation to merge all summaries
sess.run(init)
# training_iterations_count: The total number of data pieces to train on
# batch_size: The number of data pieces per batch
training_iterations = range(0, training_iterations_count, batch_size)
if tqdm_installed:
    # Add a progress bar if TQDM is installed
    training_iterations = tqdm(training_iterations)

for i in training_iterations:
    # Select indices for a random data subset
    batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)

    # Use the selected subset indices to initialize the graph's
    # placeholder values
    hyps, evis, ys = (data_feature_list[0][batch, :],
                      data_feature_list[1][batch, :],
                      correct_scores[batch])

    # Run the optimization with these initialized values
    sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys})

    # display_step: how often the accuracy and loss should be tested
    # and displayed
    if (i // batch_size) % display_step == 0:
        # Calculate batch accuracy and batch loss
        acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
        tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys})
        # Display results
        print("Iter " + str(i // batch_size) + ", Minibatch Loss= " +
              "{:.6f}".format(tmp_loss) + ", Training Accuracy= " +
              "{:.5f}".format(acc))
        summary = tf.Summary(value=[tf.Summary.Value(tag="Accuracy",
                                                     simple_value=acc)])
        writer.add_summary(summary, i)
evidences = ["Janos and Jade both were at the scene of the car crash."]
hypotheses = ["Multiple people saw the accident."]

sentence1 = [fit_to_size(np.vstack(sentence2sequence(evidence)[0]),
                         (30, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence(hypothesis)[0]),
                         (30, 50)) for hypothesis in hypotheses]

# Note: during training the `hyp` placeholder was fed SNLI's sentence1 and
# `evi` was fed sentence2, so the evidence goes into `hyp` here to stay
# consistent with that assignment.
prediction = sess.run(classification_scores, feed_dict={hyp: (sentence1 * N),
                                                        evi: (sentence2 * N),
                                                        y: [[0, 0, 0]] * N})
print(["Positive", "Neutral", "Negative"][np.argmax(prediction[0])] +
      " entailment")

sess.close()
You need the SNLI corpus from https://nlp.stanford.edu/projects/snli as well as the GloVe embedding data from https://nlp.stanford.edu/projects/glove/.
Inspired by https://www.oreilly.com/learning/textual-entailment-with-tensorflow.
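
If you want to fetch both datasets from the script itself, here is a minimal download-and-extract sketch using the urllib and zipfile modules already imported above (the archive URLs are taken from the Stanford project pages; adjust them if they have moved):

import os
import urllib.request
import zipfile

def download_and_unzip(url, zip_name):
    # Fetch the archive only if it is not already present, then extract
    # its contents into the current directory.
    if not os.path.isfile(zip_name):
        urllib.request.urlretrieve(url, zip_name)
    with zipfile.ZipFile(zip_name, "r") as archive:
        archive.extractall()

download_and_unzip("https://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip")
download_and_unzip("https://nlp.stanford.edu/projects/snli/snli_1.0.zip", "snli_1.0.zip")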