Experimental code for Skip-Thought Vectors using word2vec vectors as input and output

# coding:utf-8
import chainer
from chainer import cuda
import chainer.links as L
import chainer.functions as F
from chainer import optimizers
from chainer import serializers
from chainer.functions.activation import sigmoid
from chainer.functions.activation import tanh
from chainer import link
from chainer.links.connection import linear
import argparse
import math
import sys
import time
import numpy as np
import six
import struct
import utils
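

# ConditionalStatefulGRU is a stateful GRU whose gates are additionally
# conditioned on an external vector `cond` (here: the encoder's sentence
# representation), as in the skip-thought decoder. __call__ computes
#   r     = sigmoid(W_r x + U_r h + C_r cond)
#   z     = sigmoid(W_z x + U_z h + C_z cond)
#   h_bar = tanh(W x + U (r * h) + C cond)
#   h_new = z * h_bar + (1 - z) * h
# and keeps h_new as the new hidden state; on the first step (h is None) the
# recurrent terms are omitted.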
class ConditionalStatefulGRU(link.Chain):
    def __init__(self, n_inputs, n_units, n_cond):
        super(ConditionalStatefulGRU, self).__init__(
            W_r=linear.Linear(n_inputs, n_units),
            U_r=linear.Linear(n_units, n_units),
            W_z=linear.Linear(n_inputs, n_units),
            U_z=linear.Linear(n_units, n_units),
            W=linear.Linear(n_inputs, n_units),
            U=linear.Linear(n_units, n_units),
            C_r=linear.Linear(n_cond, n_units),
            C_z=linear.Linear(n_cond, n_units),
            C=linear.Linear(n_cond, n_units),
        )
        self.reset_state()

    def to_cpu(self):
        super(ConditionalStatefulGRU, self).to_cpu()
        if self.h is not None:
            self.h.to_cpu()

    def to_gpu(self, device=None):
        super(ConditionalStatefulGRU, self).to_gpu(device)
        if self.h is not None:
            self.h.to_gpu(device)

    def set_state(self, h):
        assert isinstance(h, chainer.Variable)
        h_ = h
        if self.xp == np:
            h_.to_cpu()
        else:
            h_.to_gpu()
        self.h = h_

    def reset_state(self):
        self.h = None

    def __call__(self, x, cond):
        z = self.W_z(x)
        h_bar = self.W(x)
        if self.h is not None:
            r = sigmoid.sigmoid(self.W_r(x) + self.U_r(self.h) + self.C_r(cond))
            z += self.U_z(self.h)
            h_bar += self.U(r * self.h)
        z = sigmoid.sigmoid(z + self.C_z(cond))
        h_bar = tanh.tanh(h_bar + self.C(cond))
        h_new = z * h_bar
        if self.h is not None:
            h_new += (1 - z) * self.h
        self.h = h_new
        return self.h
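

# SkipThought wires the pieces together: a StatefulGRU encoder reads the
# word2vec vectors of the current sentence, and its hidden state is used as
# the condition vector; decoder_b and decoder_a are conditional GRUs that
# reconstruct the previous and the following sentence. Because inputs and
# targets are dense word2vec vectors rather than word IDs, each decoder
# regresses the next word vector through embed_dec and the loss is a mean
# squared error instead of the usual softmax cross-entropy.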
class SkipThought(chainer.Chain):
    def __init__(self, n_vocab, n_dec_units, n_cond, train=True):
        super(SkipThought, self).__init__(
            encoder=L.StatefulGRU(n_vocab, n_cond),
            decoder_b=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            decoder_a=ConditionalStatefulGRU(n_vocab, n_dec_units, n_cond),
            embed_dec=L.Linear(n_dec_units, n_vocab),
        )
        self.train = train
        self.n_vocab = n_vocab
        self.n_cond = n_cond
        self.n_dec_units = n_dec_units
        self.volatile = 'off' if train else 'on'
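
    # Forward pass for training: encode the current sentence into `cond`, then
    # run each decoder with teacher forcing. At every step the decoder receives
    # the gold word2vec vector of the previous target word (starting from
    # '<eos>') together with `cond`, and the squared error between the
    # predicted vector and the next target vector is accumulated.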
    def __call__(self, x_words, y_words, z_words):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)

        cost_a = cost_b = 0
        word = get_word_vectors('<eos>')
        for next in y_words.data:
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            y_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(F.dropout(y_c), cond)
            next_b = self.embed_dec(F.dropout(emb_b))
            cost_b += F.mean_squared_error(next_b, y_n)
            word = next
            # print(np.argmax(next_b.data), next, cost_b.data)

        word = get_word_vectors('<eos>')
        for next in z_words.data:
            z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            z_n = chainer.Variable(self.xp.array([next], dtype=np.float32), volatile=self.volatile)
            emb_a = self.decoder_a(F.dropout(z_c), cond)
            next_a = self.embed_dec(F.dropout(emb_a))
            cost_a += F.mean_squared_error(next_a, z_n)
            word = next

        self.loss = cost_a + cost_b
        return self.loss

    def encode(self, x_words):
        h = np.ndarray((1, self.n_cond), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_cond)
        self.encoder.set_state(chainer.Variable(h, volatile=self.volatile))
        for word in x_words.data:
            x = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            cond = self.encoder(F.dropout(x, train=self.train))
        return cond
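
    # Generation for the backward decoder: at each step the predicted vector is
    # normalised, assigned to a k-means cluster, and a word is sampled from a
    # softmax over the negative mean squared distances to the word2vec vectors
    # inside that cluster. Generation stops at `stop_words` or after 50 words;
    # the corresponding loop for the forward decoder is currently commented out.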
    def decode(self, x_words, stop_words, vocab, voc_inv, word_size, vector_size, cluster):
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_b.set_state(chainer.Variable(h, volatile=self.volatile))
        h = np.ndarray((1, self.n_dec_units), dtype=np.float32)
        h[0][:] = np.random.uniform(-0.1, 0.1, self.n_dec_units)
        self.decoder_a.set_state(chainer.Variable(h, volatile=self.volatile))
        cond = self.encode(x_words)

        back_sentence = []
        ahead_sentence = []
        word = get_word_vectors('<eos>')
        next = 0
        count = 0
        while(voc_inv[next] != stop_words and count <= 50):
            # print(self.decoder_b.h.data)
            y_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
            emb_b = self.decoder_b(y_c, cond)
            next_b = self.embed_dec(emb_b)
            # print(next_b.data / utils.norm(next_b.data[0]))
            cluster_b = cluster.predict(next_b.data / utils.norm(next_b.data[0]))
            now = time.time()
            cluster_voc = [key for (i, key) in enumerate(vocab) if cluster.labels_[i] == cluster_b]
            error_b = chainer.Variable(self.xp.array([[-((next_b.data[0] / utils.norm(next_b.data[0]) - vocab[value]) ** 2).mean() for value in cluster_voc]], dtype=np.float32))
            print(time.time() - now)
            prob_b = F.softmax(error_b).data[0]
            next = np.random.multinomial(1, prob_b).argmax()
            print(cluster_b, next, prob_b[next])
            word = get_word_vectors(cluster_voc[next])
            back_sentence.append(cluster_voc[next])
            count += 1

        # word = get_word_vectors('<eos>')
        # next = 0
        # count = 0
        # while(voc_inv[next] != stop_words and count <= 50):
        #     z_c = chainer.Variable(self.xp.array([word], dtype=np.float32), volatile=self.volatile)
        #     emb_a = self.decoder_a(z_c, cond)
        #     next_a = self.embed_dec(emb_b)
        #     error_a = chainer.Variable(self.xp.array([[-((next_a.data[0] - [vocab[voc_inv[count]]]) ** 2).mean() for count in six.moves.range(word_size)]], dtype=np.float32))
        #     prob_a = F.softmax(error_a).data[0] - 0.0001
        #     next = np.random.multinomial(1, prob_a).argmax()
        #     word = get_word_vectors(next)
        #     ahead_sentence.append(next)
        #     count += 1

        return back_sentence, ahead_sentence
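

# Command-line usage (sketch; the script file name and data paths below are
# placeholders, and --dict must point to a word-vector file readable by
# utils.load_word2vec):
#   training:  python skip_thought_word2vec.py --source corpus.txt --dict vectors.bin --gpu 0
#   decoding:  python skip_thought_word2vec.py --source corpus.txt --dict vectors.bin \
#              --initmodel skip-thought.model --decode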
parser = argparse.ArgumentParser()
parser.add_argument('--source', '-s', default='',
                    help='source text with space-separated words')
parser.add_argument('--dict', '-d', default='',
                    help='word vector dictionary')
parser.add_argument('--initmodel', '-m', default='',
                    help='Initialize the model from given file')
parser.add_argument('--gpu', '-g', default=-1, type=int,
                    help='GPU ID (negative value indicates CPU)')
parser.add_argument('--epoch', '-e', default=20, type=int,
                    help='number of epochs to learn')
parser.add_argument('--unit', '-u', default=2400, type=int,
                    help='number of units')
parser.add_argument('--batchsize', '-b', type=int, default=20,
                    help='learning minibatch size')
parser.add_argument('--gradclip', '-c', type=int, default=5,
                    help='gradient norm threshold to clip')
parser.add_argument('--test', dest='test', action='store_true')
parser.add_argument('--decode', dest='decode', action='store_true')
parser.set_defaults(test=False)
parser.set_defaults(decode=False)
args = parser.parse_args()

xp = cuda.cupy if args.gpu >= 0 else np

n_epoch = args.epoch        # number of epochs
n_units = args.unit         # number of units per layer
batchsize = args.batchsize  # minibatch size
grad_clip = args.gradclip   # gradient norm threshold to clip


def get_word_vectors(word):
    global vocab
    if word in vocab:
        return vocab[word]
    else:
        # fall back to a fixed vector for out-of-vocabulary words
        print(word)
        ret = np.zeros((vector_size,), dtype=np.float32)
        ret[0] = 1.
        return ret


vocab, voc_inv, word_size, vector_size = utils.load_word2vec(args.dict)
# k-means clustering of the word vectors, used at decode time
km = utils.make_cluster(list(vocab.values()), 50)
print(km.counts_)

# Prepare the skip-thought model
model = SkipThought(vector_size, n_units, n_units, train=not args.decode)
model.compute_accuracy = False  # accuracy is not computed for this model
for param in model.params():
    data = param.data
    data[:] = np.random.uniform(-0.1, 0.1, data.shape)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Setup optimizer
optimizer = optimizers.Adam(alpha=0.0002, beta1=0.1, beta2=0.001, eps=1e-8)
optimizer.setup(model)

# Init/Resume
if args.initmodel:
    print('Load model from', args.initmodel)
    serializers.load_npz(args.initmodel, model)

if args.decode:
    i = 0
    train_data = open(args.source)
    for paragraph in train_data:
        paragraph = paragraph.replace('\n', '').strip()
        for line in paragraph.split('。'):
            words = line.strip().split()
            if len(words) == 0:
                continue
            i += 1
            words.append('。')
            words.append('<eos>')
            print(words)
            x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in words]), volatile='on')
            back, ahead = model.decode(x, '.', vocab, voc_inv, word_size, vector_size, km)
            print(back)
            print()
            if args.test:
                if i >= 100:
                    break
else:
    # Learning loop
    cur_log_perp = xp.zeros(())
    epoch = 0
    start_at = time.time()
    cur_at = start_at
    accum_loss = 0
    print('going to train {} epochs'.format(n_epoch))
    i = 0
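
    # Sentences are streamed in order through a three-slot window
    # (back_words / current_words / ahead_words). The middle sentence is
    # encoded and the model regresses the word vectors of its neighbours.
    # Gradients are accumulated over `batchsize` sentences before each
    # optimizer update, and the average loss over the last 100 sentences
    # is printed.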
    for epoch in six.moves.range(n_epoch):
        back_words = None
        current_words = None
        ahead_words = None
        train_data = open(args.source)
        for paragraph in train_data:
            paragraph = paragraph.replace('\n', '').strip()
            for line in paragraph.split('。'):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                i += 1
                back_words = current_words
                current_words = ahead_words
                ahead_words = words
                ahead_words.append('。')
                ahead_words.append('<eos>')
                if back_words is None or current_words is None:
                    continue
                x = chainer.Variable(xp.asarray([get_word_vectors(word) for word in current_words]))
                back = chainer.Variable(xp.asarray([get_word_vectors(word) for word in back_words]))
                ahead = chainer.Variable(xp.asarray([get_word_vectors(word) for word in ahead_words]))
                loss_i = model(x, back, ahead)
                accum_loss += loss_i
                cur_log_perp += model.loss.data

                if (i + 1) % batchsize == 0:  # Run truncated BPTT
                    model.zerograds()
                    accum_loss.backward()
                    accum_loss.unchain_backward()
                    optimizer.update()
                    accum_loss = 0

                if (i + 1) % 100 == 0:
                    now = time.time()
                    throughput = 100. / (now - cur_at)
                    loss = float(cur_log_perp) / 100
                    print('iter {} training loss: {:.2f} ({:.2f} iters/sec)'.format(
                        i + 1, loss, throughput))
                    cur_at = now
                    cur_log_perp.fill(0)

                if (i + 1) % 10000 == 0:
                    # Save the model and the optimizer
                    model.to_cpu()
                    print('save the model')
                    serializers.save_npz('skip-thought.model', model)
                    print('save the optimizer')
                    serializers.save_npz('skip-thought.state', optimizer)
                    if args.gpu >= 0:
                        # move the parameters back to the GPU before continuing
                        model.to_gpu()

                sys.stdout.flush()
                if args.test:
                    if i >= 100 * (epoch + 1):
                        break

    # Save the model and the optimizer
    model.to_cpu()
    print('save the model')
    serializers.save_npz('skip-thought.model', model)
    print('save the optimizer')
    serializers.save_npz('skip-thought.state', optimizer)