Experimental code for Skip-Thought Vectors using word2vec vectors as input and output

# coding:utf-8
import chainer
from chainer import cuda
import chainer.links as L
import chainer.functions as F
from chainer import optimizers
from chainer import serializers
from chainer.functions.activation import sigmoid
from chainer.functions.activation import tanh
from chainer import link
from chainer.links.connection import linear
import argparse
import math
import sys
import time
import numpy as np
import six
import struct
import utils
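

# ConditionalStatefulGRU is a stateful GRU whose gates are additionally
# conditioned on an external vector `cond` (here: the encoder's sentence
# representation), as in the skip-thought decoder. __call__ computes
#   r     = sigmoid(W_r x + U_r h + C_r cond)
#   z     = sigmoid(W_z x + U_z h + C_z cond)
#   h_bar = tanh(W x + U (r * h) + C cond)
#   h_new = z * h_bar + (1 - z) * h
# and keeps h_new as the new hidden state; on the first step (h is None) the
# recurrent terms are omitted.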
class ConditionalStatefulGRU(link.Chain):
    def __init__(self, n_inputs, n_units, n_cond):
        super(ConditionalStatefulGRU, self).__init__(
            W_r=linear.Linear(n_inputs, n_units),
            U_r=linear.Linear(n_units, n_units),
            W_z=linear.Linear(n_inputs, n_units),
            U_z=linear.Linear(n_units, n_units),
            W=linear.Linear(n_inputs, n_units),
            U=linear.Linear(n_units, n_units),
            C_r=linear.Linear(n_cond, n_units),
            C_z=linear.Linear(n_cond, n_units),
            C=linear.Linear(n_cond, n_units),
        )
        self.reset_state()

    def to_cpu(self):
        super(ConditionalStatefulGRU, self).to_cpu()
        if self.h is not None:
            self.h.to_cpu()

    def to_gpu(self, device=None):
        super(ConditionalStatefulGRU, self).to_gpu(device)
        if self.h is not None:
            self.h.to_gpu(device)

    def set_state(self, h):
        assert isinstance(h, chainer.Variable)
        h_ = h
        if self.xp == np:
            h_.to_cpu()
        else:
            h_.to_gpu()
        self.h = h_

    def reset_state(self):
        self.h = None

    def __call__(self, x, cond):
        z = self.W_z(x)
        h_bar = self.W(x)
        if self.h is not None:
            r = sigmoid.sigmoid(self.W_r(x) + self.U_r(self.h) + self.C_r(cond))
            z += self.U_z(self.h)
            h_bar += self.U(r * self.h)
        z = sigmoid.sigmoid(z + self.C_z(cond))
        h_bar = tanh.tanh(h_bar + self.C(cond))
        h_new = z * h_bar
        if self.h is not None:
            h_new += (1 - z) * self.h
        self.h = h_new
        return self.h
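

# SkipThought wires the pieces together: a StatefulGRU encoder reads the
# word2vec vectors of the current sentence, and its hidden state is used as
# the condition vector; decoder_b and decoder_a are conditional GRUs that
# reconstruct the previous and the following sentence. Because inputs and
# targets are dense word2vec vectors rather than word IDs, each decoder
# regresses the next word vector through embed_dec and the loss is a mean
# squared error instead of the usual softmax cross-entropy.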
class SkipThought(chainer.Chain):
    def __init__(self, n_vocab, n_dec_units, n_cond, train=True):
        super(SkipThought, self).__init__(
            encoder=L.StatefulGRU(n_vocab, n_cond),
            decoder_b=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            decoder_a=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            embed_dec=L.Linear(n_dec_units, n_vocab),
        )
        self.train = train
        self.n_vocab = n_vocab
        self.n_cond = n_cond
        self.n_dec_units = n_dec_units
        self.volatile = 'off' if train else 'on'
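
    # Forward pass for training: encode the current sentence into `cond`, then
    # run each decoder with teacher forcing. At every step the decoder receives
    # the gold word2vec vector of the previous target word (starting from
    # '<eos>') together with `cond`, and the squared error between the
    # predicted vector and the next target vector is accumulated.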
    def __call__(self, x_words, y_words, z_words):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)

        cost_a = cost_b = 0
        word = get_word_vectors('<eos>')
        for next in y_words.data:
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            y_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(F.dropout(y_c), cond)
            next_b = self.embed_dec(F.dropout(emb_b))
            cost_b += F.mean_squared_error(next_b, y_n)
            word = next
            # print(np.argmax(next_b.data), next, cost_b.data)

        word = get_word_vectors('<eos>')
        for next in z_words.data:
            z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            z_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_a = self.decoder_a(F.dropout(z_c), cond)
            next_a = self.embed_dec(F.dropout(emb_a))
            cost_a += F.mean_squared_error(next_a, z_n)
            word = next

        self.loss = cost_a + cost_b
        return self.loss

    def encode(self, x_words):
        h = np.ndarray((1, self.n_cond), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_cond)
        self.encoder.set_state(chainer.Variable(h, volatile=self.volatile))
        for word in x_words.data:
            x = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            cond = self.encoder(F.dropout(x, train=self.train))
        return cond
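
    # Generation for the backward decoder: at each step the predicted vector is
    # normalised, assigned to a k-means cluster, and a word is sampled from a
    # softmax over the negative mean squared distances to the word2vec vectors
    # inside that cluster. Generation stops at `stop_words` or after 50 words;
    # the corresponding loop for the forward decoder is currently commented out.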
    def decode(self, x_words, stop_words, vocab, voc_inv, word_size, vector_size, cluster):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)

        back_sentence = []
        ahead_sentence = []
        word = get_word_vectors('<eos>')
        next = 0
        count = 0
        while(voc_inv[next] != stop_words and count <= 50):
            # print(self.decoder_b.h.data)
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(y_c, cond)
            next_b = self.embed_dec(emb_b)
            # print(next_b.data / utils.norm(next_b.data[0]))
            cluster_b = cluster.predict(next_b.data / utils.norm(next_b.data[0]))
            now = time.time()
            cluster_voc = [key for (i, key) in enumerate(vocab) if cluster.labels_[i] == cluster_b]
            error_b = chainer.Variable(self.xp.array([[-((next_b.data[0] / utils.norm(next_b.data[0]) - vocab[value]) ** 2).mean() for value in cluster_voc]], dtype=np.float32))
            print(time.time() - now)
            prob_b = F.softmax(error_b).data[0]
            next = np.random.multinomial(1, prob_b).argmax()
            print(cluster_b, next, prob_b[next])
            word = get_word_vectors(cluster_voc[next])
            back_sentence.append(cluster_voc[next])
            count += 1

        # word = get_word_vectors('<eos>')
        # next = 0
        # count = 0
        # while(voc_inv[next] != stop_words and count <= 50):
        #     z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
        #     emb_a = self.decoder_a(z_c, cond)
        #     next_a = self.embed_dec(emb_b)
        #     error_a = chainer.Variable(self.xp.array([[-((next_a.data[0] - [vocab[voc_inv[count]]]) ** 2).mean() for count in six.moves.range(word_size)]], dtype=np.float32))
        #     prob_a = F.softmax(error_a).data[0] - 0.0001
        #     next = np.random.multinomial(1, prob_a).argmax()
        #     word = get_word_vectors(next)
        #     ahead_sentence.append(next)
        #     count += 1

        return back_sentence, ahead_sentence
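

# Command-line usage (sketch; the script file name and data paths below are
# placeholders, and --dict must point to a word-vector file readable by
# utils.load_word2vec):
#   training:  python skip_thought_word2vec.py --source corpus.txt --dict vectors.bin --gpu 0
#   decoding:  python skip_thought_word2vec.py --source corpus.txt --dict vectors.bin \
#              --initmodel skip-thought.model --decode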
parser = argparse.ArgumentParser()
parser.add_argument('--source', '-s', default='',
                    help='source text with space-separated words')
parser.add_argument('--dict', '-d', default='',
                    help='word vector dictionary')
parser.add_argument('--initmodel', '-m', default='',
                    help='Initialize the model from given file')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--epoch', '-e', default=20, type=int,
                    help='number of epochs to learn')
parser.add_argument('--unit', '-u', default=2400, type=int,
                    help='number of units')
parser.add_argument('--batchsize', '-b', type=int, default=20,
                    help='learning minibatch size')
parser.add_argument('--gradclip', '-c', type=int, default=5,
                    help='gradient norm threshold to clip')
parser.add_argument('--test', dest='test', action='store_true')
parser.add_argument('--decode', dest='decode', action='store_true')
parser.set_defaults(test=False)
parser.set_defaults(decode=False)
args = parser.parse_args()

xp = cuda.cupy if args.gpu >= 0 else np

n_epoch = args.epoch        # number of epochs
n_units = args.unit         # number of units per layer
batchsize = args.batchsize  # minibatch size
grad_clip = args.gradclip   # gradient norm threshold to clip


def get_word_vectors(word):
    global vocab
    if word in vocab:
        return vocab[word]
    else:
        # fall back to a fixed vector for out-of-vocabulary words
        print(word)
        ret = np.zeros((vector_size,), dtype=np.float32)
        ret[0] = 1.
        return ret


vocab, voc_inv, word_size, vector_size = utils.load_word2vec(args.dict)
# k-means clustering of the word vectors, used at decode time
km = utils.make_cluster(list(vocab.values()), 50)
print(km.counts_)

# Prepare the skip-thought model
model = SkipThought(vector_size, n_units, n_units, train=not args.decode)
model.compute_accuracy = False  # accuracy is not computed for this model
for param in model.params():
    data = param.data
    data[:] = np.random.uniform(-0.1, 0.1, data.shape)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.Adam(alpha=0.0002, beta1=0.1, beta2=0.001, eps=1e-8)
optimizer.setup(model)

# Init/Resume
if args.initmodel:
    print('Load model from', args.initmodel)
    serializers.load_npz(args.initmodel, model)

if args.decode:
    i = 0
    train_data = open(args.source)
    for paragraph in train_data:
        paragraph = paragraph.replace('\n', '').strip()
        for line in paragraph.split('。'):
            words = line.strip().split()
            if len(words) == 0:
                continue
            i += 1
            words.append('。')
            words.append('<eos>')
            print(words)
            x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in words]), volatile='on')
            back, ahead = model.decode(x, '.', vocab, voc_inv, word_size, vector_size, km)
            print(back)
            print()
            if args.test:
                if i >= 100:
                    break
else:
    # Learning loop
    cur_log_perp = xp.zeros(())
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    accum_loss = 0
    print('going to train {} epochs'.format(n_epoch))
    i = 0
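
    # Sentences are streamed in order through a three-slot window
    # (back_words / current_words / ahead_words). The middle sentence is
    # encoded and the model regresses the word vectors of its neighbours.
    # Gradients are accumulated over `batchsize` sentences before each
    # optimizer update, and the average loss over the last 100 sentences
    # is printed.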
    for epoch in six.moves.range(n_epoch):
        back_words = None
        current_words = None
        ahead_words = None
        train_data = open(args.source)
        for paragraph in train_data:
            paragraph = paragraph.replace('\n', '').strip()
            for line in paragraph.split('。'):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                i += 1
                back_words = current_words
                current_words = ahead_words
                ahead_words = words
                ahead_words.append('。')
                ahead_words.append('<eos>')
                if back_words is None or current_words is None:
                    continue
                x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in current_words]))
                back = chainer.Variable(xp.asarray([get_word_vectors(word) for word in back_words]))
                ahead = chainer.Variable(xp.asarray([get_word_vectors(word) for word in ahead_words]))
                loss_i = model(x, back, ahead)
                accum_loss += loss_i
                cur_log_perp += model.loss.data

                if (i + 1) % batchsize == 0:  # Run truncated BPTT
                    model.zerograds()
                    accum_loss.backward()
                    accum_loss.unchain_backward()
                    optimizer.update()
                    accum_loss = 0

                if (i + 1) % 100 == 0:
                    now = time.time()
                    throughput = 100. / (now - cur_at)
                    loss = float(cur_log_perp) / 100
                    print('iter {} training loss: {:.2f} ({:.2f} iters/sec)'.format(
                        i + 1, loss, throughput))
                    cur_at = now
                    cur_log_perp.fill(0)

                if (i + 1) % 10000 == 0:
                    # Save the model and the optimizer
                    model.to_cpu()
                    print('save the model')
                    serializers.save_npz('skip-thought.model', model)
                    print('save the optimizer')
                    serializers.save_npz('skip-thought.state', optimizer)
                    if args.gpu >= 0:
                        # move the parameters back to the GPU before continuing
                        model.to_gpu()

                sys.stdout.flush()
                if args.test:
                    if i >= 100 * (epoch + 1):
                        break

    # Save the model and the optimizer
    model.to_cpu()
    print('save the model')
    serializers.save_npz('skip-thought.model', model)
    print('save the optimizer')
    serializers.save_npz('skip-thought.state', optimizer)