load pre-trained word2vec into cnn-text-classification-tf
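Two files follow: the TextCNN model (text_cnn.py, which the training script imports) and a training script extended with a word2vec flag that seeds the embedding matrix from a pre-trained word2vec binary before training starts. As a rough usage sketch (assuming the script is saved as train.py, as in the upstream cnn-text-classification-tf project, and that the GoogleNews 300-dimensional binary is used), it would be launched along these lines, with embedding_dim matching the dimensionality of the vectors in the file, since each pre-trained vector is copied into a row of the embedding matrix:

./train.py --word2vec=GoogleNews-vectors-negative300.bin --embedding_dim=300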
import tensorflow as tf
import numpy as np


class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
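Note that both files target the pre-1.0 TensorFlow API (tf.concat with the axis first, tf.scalar_summary, tf.initialize_all_variables, and so on). If you want to run the model class above on TensorFlow 1.x, at least two calls changed: tf.concat now takes the axis last, and softmax_cross_entropy_with_logits requires keyword arguments. A minimal sketch of the adjusted lines, with everything else unchanged:

self.h_pool = tf.concat(pooled_outputs, 3)
losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)

The training script below builds the vocabulary, constructs the TextCNN graph and, when the word2vec flag is set, overwrites the randomly initialized embedding matrix with the pre-trained vectors before training starts.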
#! /usr/bin/env python

import numpy as np
import os
import time
import datetime
import data_helpers
import tensorflow as tf
from text_cnn import TextCNN
from tensorflow.contrib import learn

# Parameters
# ==================================================

# Model Hyperparameters
tf.flags.DEFINE_string("word2vec", None, "Word2vec file with pre-trained embeddings (default: None)")
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels()

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=2,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())
        if FLAGS.word2vec:
            # Initialize the embedding matrix with random uniform values
            initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
            # Load any vectors from the word2vec binary that match the vocabulary
            print("Load word2vec file {}\n".format(FLAGS.word2vec))
            with open(FLAGS.word2vec, "rb") as f:
                header = f.readline()
                vocab_size, layer1_size = map(int, header.split())
                binary_len = np.dtype('float32').itemsize * layer1_size
                for line in xrange(vocab_size):
                    word = []
                    while True:
                        ch = f.read(1)
                        if ch == ' ':
                            word = ''.join(word)
                            break
                        if ch != '\n':
                            word.append(ch)
                    idx = vocab_processor.vocabulary_.get(word)
                    if idx != 0:
                        initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                    else:
                        f.read(binary_len)

            sess.run(cnn.W.assign(initW))
        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))