import gzip
import pickle

import numpy as np
import tensorflow as tf
import tflearn
from gensim.models.word2vec import Word2Vec
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.conv import conv_1d, global_max_pool, conv_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from tflearn.layers.merge_ops import merge
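# Hyperparameters for a two-channel text CNN (one frozen "static" embedding
# channel and one trainable "non-static" channel, several filter widths with
# max-over-time pooling), in the style of Kim (2014), "Convolutional Neural
# Networks for Sentence Classification".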
dropout_keep_prob = 0.5
batch_size = 50
n_classes = 2
embedding_dim = 300
filter_sizes = [2, 3, 4, 5]
num_filters = 100
snapshot_every = 200
def load_embeddings(filename, vocab):
    """Build an initial embedding matrix from pretrained word2vec vectors.

    :param filename: path to a binary word2vec model (e.g. the GoogleNews vectors)
    :type vocab: dict
    :param vocab: mapping from word to integer index
    :return: a (len(vocab), embedding_dim) numpy array; words missing from the
        word2vec model keep their random uniform initialization
    """
    w2v = Word2Vec.load_word2vec_format(filename, binary=True)
    # initial matrix with random uniform
    initW = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    for word, idx in vocab.iteritems():
        if word in w2v:
            initW[idx] = w2v[word]
    return initW
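# imdb.dict.pkl (from the LISA lab tutorial data referenced below) maps each
# word to the integer index used in the imdb.pkl reviews; it is downloaded on
# first use.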
def load_imdb_vocab(path="imdb.dict.pkl"):
    path = imdb.get_dataset_file(
        path, "imdb.dict.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.dict.pkl.gz")
    if path.endswith(".gz"):
        f = gzip.open(path, 'rb')
    else:
        f = open(path, 'rb')
    vocab = pickle.load(f)
    return vocab
vocab = load_imdb_vocab("imdb.dict.pkl")
print "loaded vocab, size: {}".format(len(vocab))
vocab_size = len(vocab)
embeddings = load_embeddings('GoogleNews-vectors-negative300.bin', vocab)
# vocab_size = 102098

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=vocab_size,
                                valid_portion=0.1)
X_train, y_train = train
X_test, y_test = test
# Data preprocessing
# Sequence padding
max_document_length = 280
X_train = pad_sequences(X_train, maxlen=max_document_length, value=0.)
X_test = pad_sequences(X_test, maxlen=max_document_length, value=0.)
# Converting labels to binary vectors
y_train = to_categorical(y_train, nb_classes=n_classes)
y_test = to_categorical(y_test, nb_classes=n_classes)
max_document_length = X_train.shape[1]
print "Loaded data"
print "Max document length {}".format(max_document_length)
# Building convolutional network
network = input_data(shape=[None, max_document_length], name='input')
with tf.name_scope('embedding'):
    net1 = tflearn.embedding(network, input_dim=vocab_size,
                             output_dim=embedding_dim, trainable=False,
                             name="static-embedding")
    net2 = tflearn.embedding(network, input_dim=vocab_size,
                             output_dim=embedding_dim, trainable=True,
                             name="non-static")
    net1 = tf.expand_dims(net1, -1)
    net2 = tf.expand_dims(net2, -1)
# network = merge([net1, net2], 'concat', axis=2, name='CombineTwoEmbeddings')
branches = []
for filter_size in filter_sizes:
    # branch = conv_2d(network, num_filters, [filter_size, max_document_length, 1, num_filters],
    #                  padding='valid', activation='relu', name="conv-{}".format(filter_size))
    # branches.append(branch)
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, embedding_dim, 1, num_filters]
        W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
        b1 = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b1")
        b2 = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b2")
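        # W is shared between the static and non-static channels; only the
        # biases differ, so each channel produces its own pooled feature map
        # from the same convolution filters.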
        def conv_shr(input, b):
            conv = tf.nn.conv2d(
                input,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            return pooled

        pooled1 = conv_shr(net1, b1)
        pooled2 = conv_shr(net2, b2)
        branches.append(pooled1)
        branches.append(pooled2)
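# Each branch is a [batch, 1, 1, num_filters] tensor of max-pooled features;
# concatenating along the last axis gives len(filter_sizes) * 2 * num_filters
# features per example before dropout and the softmax classifier.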
network = merge(branches, mode='concat', axis=3)
network = dropout(network, dropout_keep_prob)
network = fully_connected(network, n_classes, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
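# The word2vec matrix built above ('embeddings') is otherwise never copied into
# the embedding layers. A minimal sketch of one way to do that, assuming
# tflearn's get_layer_variables_by_name helper and DNN.set_weights can look the
# layers up by the names given above: overwrite both channels before training.
for layer_name in ["static-embedding", "non-static"]:
    layer_vars = tflearn.get_layer_variables_by_name(layer_name)
    model.set_weights(layer_vars[0], embeddings)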
model.fit(X_train, y_train,
          n_epoch=5, shuffle=True,
          validation_set=(X_test, y_test),
          show_metric=True,
          batch_size=batch_size,
          snapshot_step=snapshot_every)
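# A quick check on held-out accuracy after training; DNN.evaluate returns the
# metric (accuracy here) computed over the given set.
print "test accuracy: {}".format(model.evaluate(X_test, y_test, batch_size=batch_size))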