Text Classification Training Code (mxnet)
common.py
import mxnet as mx
import numpy as np

# build and return an NDArray iterator over the padded text, tag and label arrays
def get_data_iter(data, tags, labels, shuffle=False, batch_size=64):
    nditer = mx.io.NDArrayIter(data={'data': data, 'tags': tags},
                               label={'labels': labels},
                               batch_size=batch_size, shuffle=shuffle)
    return nditer
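# A minimal usage sketch (hypothetical shapes, not part of the original gist):
#   data   = np.random.randint(0, 100, size=(128, 400))  # token ids, (samples, maxlen)
#   tags   = np.random.randint(0, 11, size=(128, 400))   # tag ids, same shape
#   labels = np.random.randint(0, 2, size=(128, 20))     # multi-hot class labels
#   it = get_data_iter(data, tags, labels, shuffle=True, batch_size=64)
#   for batch in it:
#       print(batch.data[0].shape, batch.label[0].shape)  # (64, 400) (64, 20)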
# original precision/recall functions (kept for reference)
'''
def precision(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + 1e-8)
    return precision

def recall(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + 1e-8)
    return recall
'''
# top-3 metric functions
def precision(y_true, y_pred, topk=3):
    # mark 1 on the top-k predicted labels of each sample
    topk_indices = []
    for idx in range(len(y_pred)):
        topk_indices.append(y_pred[idx].argsort()[-topk:][::-1])
    temp = np.zeros_like(y_pred)
    for idx in range(len(temp)):
        temp[idx, topk_indices[idx]] = 1.
    y_pred = temp
    # per-sample count of true labels that appear in the top k
    true_positives = np.sum(y_true * y_pred, axis=1)
    # a sample counts as a hit only when *all* of its true labels are in the top k
    true_positives = np.sum(true_positives == np.sum(y_true, axis=1))
    predicted_positives = len(y_pred)  # same as np.sum(y_pred) / topk
    precision = true_positives / (predicted_positives + 1e-8)
    return precision
def recall(y_true, y_pred, topk=3):
    # same top-k marking as in precision()
    topk_indices = []
    for idx in range(len(y_pred)):
        topk_indices.append(y_pred[idx].argsort()[-topk:][::-1])
    temp = np.zeros_like(y_pred)
    for idx in range(len(temp)):
        temp[idx, topk_indices[idx]] = 1.
    y_pred = temp
    true_positives = np.sum(y_true * y_pred, axis=1)
    true_positives = np.sum(true_positives == np.sum(y_true, axis=1))
    # total number of true labels across the batch
    possible_positives = np.sum(y_true)
    recall = true_positives / (possible_positives + 1e-8)
    return recall
def fbeta_score(y_true, y_pred, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # If there are no true positives, fix the F score at 0, as sklearn does.
    if np.sum(np.round(np.clip(y_true, 0, 1))) == 0:
        return 0
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + 1e-8)
    return fbeta_score

def fmeasure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=1)
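# A quick check of the top-3 metrics on toy values (illustrative only,
# not part of the original gist):
#   y_true = np.array([[1., 0., 1., 0.],
#                      [1., 0., 0., 0.]])
#   y_pred = np.array([[.9, .1, .8, .2],
#                      [.1, .2, .3, .4]])
#   Sample 0: both true labels {0, 2} land in the top 3 -> a hit.
#   Sample 1: true label {0} is outside the top 3 of row 1 -> a miss.
#   precision(y_true, y_pred)  # ~0.5  = hits / num samples       = 1 / 2
#   recall(y_true, y_pred)     # ~0.33 = hits / total true labels = 1 / 3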
# custom operator: elementwise binary cross-entropy loss
class CrossEntropyLoss(mx.operator.CustomOp):
    eps = 1e-6  # avoid -inf when taking log(0)
    eps1 = 1. + eps
    eps_1 = 1. - eps

    def forward(self, is_train, req, in_data, out_data, aux):
        # Shapes:
        #   b = minibatch size
        #   d = number of dimensions
        actually_calculate_loss = False
        if actually_calculate_loss:
            p = in_data[0].asnumpy()  # shape=(b,d)
            y = in_data[1].asnumpy()
            out = y * np.log(p + self.eps) + (1. - y) * np.log(self.eps1 - p)
            self.assign(out_data[0], req[0], mx.nd.array(out))
        else:
            # just copy the predictions forward; only the gradient matters for training
            self.assign(out_data[0], req[0], in_data[0])

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        # self.approx_backward(req, out_grad, in_data, out_data, in_grad, aux)
        self.exact_backward(req, out_grad, in_data, out_data, in_grad, aux)

    def approx_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad = (y-p)/(p-p^2), but if y is exactly 1 or 0 this simplifies to
            grad = 1/(p-1+y),
        which is more numerically stable.
        Note the sign convention: this equals the gradient of the log-likelihood,
        the opposite sign of exact_backward() below.
        """
        p = in_data[0].asnumpy()  # shape=(b,d)
        y = in_data[1].asnumpy()
        grad = 1. / (p - self.eps_1 + y)
        self.assign(in_grad[0], req[0], mx.nd.array(grad))

    def exact_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad of the cross-entropy loss w.r.t. p: (p-y)/(p-p^2)"""
        p = in_data[0].asnumpy()  # shape=(b,d)
        y = in_data[1].asnumpy()
        grad = (p - y) / ((p + self.eps) * (self.eps1 - p))
        self.assign(in_grad[0], req[0], mx.nd.array(grad))
# register the custom operator with mxnet
@mx.operator.register("CrossEntropyLoss")
class CrossEntropyProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(CrossEntropyProp, self).__init__(need_top_grad=False)

    def list_arguments(self):
        return ['data', 'label']

    def list_outputs(self):
        return ['preds']

    def create_operator(self, ctx, shapes, dtypes):
        return CrossEntropyLoss()

    def infer_shape(self, in_shape):
        if in_shape[0] != in_shape[1]:
            raise ValueError("Input shapes differ. data:%s. label:%s. must be same"
                             % (str(in_shape[0]), str(in_shape[1])))
        output_shape = in_shape[0]
        return in_shape, [output_shape], []
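For a quick sanity check of the registered operator, a sketch like the following should work; it assumes the file above is saved as common.py (the name the training script below imports) and uses illustrative shapes:

import mxnet as mx
import numpy as np
import common  # importing the module registers 'CrossEntropyLoss' with mxnet

data = mx.sym.Variable('data')
label = mx.sym.Variable('label')
loss = mx.sym.Custom(data=data, label=label, op_type='CrossEntropyLoss')

# bind with a toy batch of 4 samples over 20 classes
exe = loss.simple_bind(ctx=mx.cpu(), data=(4, 20), label=(4, 20))
exe.forward(is_train=True,
            data=mx.nd.uniform(0.01, 0.99, (4, 20)),
            label=mx.nd.round(mx.nd.uniform(0, 1, (4, 20))))
exe.backward()  # need_top_grad=False, so no head gradient is required
print(exe.grad_arrays[0].asnumpy().shape)  # gradient w.r.t. data: (4, 20)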
symbols.py
import mxnet as mx
import os
import logging

class Network(object):
    '''
    Initializes model params and builds model symbols by name:
    pass a model name (one of the method names below) to get_model()
    to retrieve the corresponding symbol.
    '''
    def __init__(self, model_params):
        self.__init_params__(model_params)

    def __init_params__(self, model_params):
        for key, val in model_params.items():  # items() works on both py2 and py3
            setattr(self, key, val)

    def get_model(self, name):
        return getattr(self, name)()

    def __input_part(self):
        data = mx.sym.Variable('data')
        tags = mx.sym.Variable('tags')
        labels = mx.sym.Variable('labels')
        data_embed = mx.sym.Embedding(data=data, input_dim=self.max_features + 1,
                                      output_dim=self.embedding_dims, name='embed_data')
        tags_embed = mx.sym.Embedding(data=tags, input_dim=self.nb_tags + 1,
                                      output_dim=1, name='embed_tags')
        # concatenate word and tag embeddings along the feature axis
        concat = mx.sym.Concat(data_embed, tags_embed, dim=2)
        return data, tags, labels, concat
    def get_lstm_cell(self, inputs, stack_rnn=True, bi_direction=True, num_layers=1,
                      num_hidden=32, dropout=0.5):
        # build the (stacked, optionally bidirectional) LSTM cell
        if stack_rnn:
            cell = mx.rnn.SequentialRNNCell()
            for i in range(num_layers):
                cell.add(mx.rnn.FusedRNNCell(num_hidden, num_layers=1, mode='lstm',
                                             prefix='lstm_l%d' % i, bidirectional=bi_direction))
                if dropout > 0 and i < num_layers - 1:
                    cell.add(mx.rnn.DropoutCell(dropout, prefix='lstm_d%d' % i))
        else:
            cell = mx.rnn.FusedRNNCell(num_hidden, num_layers=num_layers, dropout=dropout,
                                       mode='lstm', bidirectional=bi_direction)
        output, _ = cell.unroll(self.maxlen, inputs=inputs)
        return output
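    # Shape trace for the defaults used below (illustrative):
    #   inputs from __input_part: (batch, maxlen, embedding_dims + 1)
    #   bidirectional LSTM with num_hidden units -> unrolled outputs of
    #   shape (batch, maxlen, 2 * num_hidden), which the models below
    #   transpose and pool (or reshape) before the classifier layer.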
    def fast_text_lstm(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers, num_hidden=self.nb_hidden)
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,), global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network

    def fast_text_lstm_maxpool(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers, num_hidden=self.nb_hidden)
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,), global_pool=True, pool_type='max')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network

    def fast_text(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,), global_pool=True, pool_type='avg')
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network

    def fast_text_conv(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Convolution(data=network, kernel=(3,), num_filter=self.filters)
        network = mx.sym.Dropout(data=network, p=0.5)
        network = mx.sym.Pooling(network, kernel=(self.maxlen,), global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network

    def bi_directional_lstm(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers, num_hidden=self.nb_hidden)
        network = mx.sym.Reshape(network, shape=(-1, self.nb_hidden * 2))
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network
    def combine_model1(self):
        main_input, tag_input, labels, network = self.__input_part()
        # conv part
        network_part1 = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network_part1 = mx.sym.Convolution(data=network_part1, kernel=(3,), num_filter=self.filters)
        network_part1 = mx.sym.Dropout(data=network_part1, p=0.5)
        # lstm part
        network_part2 = self.get_lstm_cell(network, num_layers=self.num_layers, num_hidden=self.nb_hidden)
        network_part2 = mx.sym.transpose(data=network_part2, axes=(0, 2, 1))
        network = mx.sym.Concat(network_part1, network_part2, dim=2)
        network = mx.sym.Pooling(network, kernel=(self.maxlen,), global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        # network = mx.sym.SoftmaxOutput(data=network, label=labels, name='output')
        network = mx.sym.Custom(data=network, label=labels, name='output', op_type='CrossEntropyLoss')
        return network
    def load_model(self, rank=0):
        if not hasattr(self, 'load_epoch') or self.load_epoch is None:
            return (None, None, None)
        assert self.model_prefix is not None
        model_prefix = self.model_prefix
        if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
            model_prefix += "-%d" % (rank)
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            model_prefix, self.load_epoch)
        logging.info('Loaded model %s_%04d.params', model_prefix, self.load_epoch)
        return (sym, arg_params, aux_params)

    def save_model(self, rank=0):
        if self.checkpoint is None:
            return None
        dst_dir = self.checkpoint
        model_prefix = os.path.join(dst_dir, self.model_name)
        if not os.path.isdir(dst_dir):
            os.mkdir(dst_dir)
        return mx.callback.do_checkpoint(model_prefix if rank == 0 else "%s-%d" % (
            model_prefix, rank))
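A usage sketch for the class above (assuming the file is saved as symbols.py, matching the training script's import; parameter values are illustrative):

from symbols import Network

model_params = {'nb_classes': 20, 'max_features': 342786, 'embedding_dims': 256,
                'maxlen': 400, 'nb_tags': 11, 'nb_hidden': 32, 'num_layers': 1,
                'filters': 256}
net = Network(model_params)
sym = net.get_model('fast_text')  # any of the model method names works here
print(sym.list_arguments())       # inspect the symbol's learnable arguments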
train.py
import mxnet as mx
import numpy as np
import os
import time
import logging  # used by get_lr_scheduler() and train()
from sklearn.model_selection import train_test_split
from symbols import Network
from common import get_data_iter, precision, recall, fbeta_score

class ParamHolder(object):
    def __init__(self, params):
        for key, val in params.items():  # items() works on both py2 and py3
            setattr(self, key, val)
# load train and valid data
def load_train_val():
    master_dir = '../data/'
    x_train = np.load(os.path.join(master_dir, 'x_train_save.npy'))
    tag_train = np.load(os.path.join(master_dir, 'tag_train_save.npy'))
    y_train = np.load(os.path.join(master_dir, 'y_train_save.npy'))
    x_valid = np.load(os.path.join(master_dir, 'x_valid_save.npy'))
    tag_valid = np.load(os.path.join(master_dir, 'tag_valid_save.npy'))
    y_valid = np.load(os.path.join(master_dir, 'y_valid_save.npy'))
    return x_train, tag_train, y_train, x_valid, tag_valid, y_valid

# load and prepare data
def prepare_data(args, maxlen):
    # keep only the last maxlen tokens of each test sequence
    x_test = np.load(args.x_test_path)[:, -maxlen:]
    tags_test = np.load(args.tag_test_path)[:, -maxlen:]
    y_test = np.load(args.y_test_path)[:, -maxlen:]  # a no-op for labels, since nb_classes < maxlen
    # train/valid split or load pre-split data (saved data uses maxlen = 400)
    # x_train, x_valid, tag_train, tag_valid, y_train, y_valid = train_test_split(x_train, tags_train, y_train, test_size=0.2, random_state=2017)
    x_train, tag_train, y_train, x_valid, tag_valid, y_valid = load_train_val()
    return x_train, tag_train, y_train, x_valid, tag_valid, y_valid, x_test, tags_test, y_test
# define data file paths
check_dir = '../checkpoints/%s'

# model params
model_params = {
    'nb_classes': 20,
    'max_features': 342786,
    'embedding_dims': 256,
    'maxlen': 400,
    'batch_size': 256,
    'filters': 256,
    'nb_hidden': 32,
    'kernel_size': 3,
    'nb_tags': 11,
    'num_layers': 1,
    'checkpoint': '../checkpoints/%s' % time.strftime('%Y%m%d'),
    'model_name': 'fast_text_lstm',
}
# train params
train_params = {
    'epoch': 40,
    'batch_size': 256,
    'x_train_path': '../data/x_train.npy',
    'tag_train_path': '../data/x_train_tags.npy',
    'y_train_path': '../data/y_train.npy',
    'x_test_path': '../data/x_test.npy',
    'tag_test_path': '../data/x_test_tags.npy',
    'y_test_path': '../data/y_test.npy',
    'check_dir': '../checkpoints/%s',
    'model_name': 'fast_text_lstm',
    'gpus': '0',
    'lr_factor': 0.1,
    'num_examples': 0,
    'load_epoch': 0,
    'kv_store': '',
    'lr_step_epochs': '20, 40',
    'lr': 0.01,
    'mom': 0.9,
    'wd': 0.0001,
    'monitor': 0,
    'optimizer': 'adam',
}
# build the mxnet lr scheduler
def get_lr_scheduler(args, kv):
    # no decay requested: return the base lr with no scheduler
    if args.lr_factor is None or args.lr_factor >= 1:
        return (args.lr, None)
    epoch_size = args.num_examples // args.batch_size
    if 'dist' in args.kv_store:
        epoch_size //= kv.num_workers
    begin_epoch = args.load_epoch if args.load_epoch else 0
    step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
    # when resuming, apply the decays that would already have happened
    lr = args.lr
    for s in step_epochs:
        if begin_epoch >= s:
            lr *= args.lr_factor
    if lr != args.lr:
        logging.info('Adjust learning rate to %e for epoch %d' % (lr, begin_epoch))
    steps = [epoch_size * (x - begin_epoch) for x in step_epochs if x - begin_epoch > 0]
    return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
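# Worked example with the defaults above (illustrative numbers):
#   num_examples=100000, batch_size=256 -> epoch_size = 390 batches per epoch
#   lr_step_epochs='20, 40', load_epoch=0 -> steps = [7800, 15600]
#   i.e. the scheduler multiplies lr by lr_factor=0.1 after batches 7800 and 15600.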
# main train function
def train():
    # load data
    mx.random.seed(2017)
    args = ParamHolder(train_params)
    x_train, tag_train, y_train, x_valid, tag_valid, y_valid, x_test, tag_test, y_test = prepare_data(args, model_params['maxlen'])
    args.num_examples = len(x_train)  # set num_examples after loading the data
    # make data iterators for the mxnet training process
    train_data_iter = get_data_iter(x_train, tag_train, y_train, batch_size=model_params['batch_size'], shuffle=True)
    valid_data_iter = get_data_iter(x_valid, tag_valid, y_valid, batch_size=model_params['batch_size'])
    # make kv store
    kv = mx.kv.create('local')
    # initialize model instance
    model_set = Network(model_params)
    # checkpoint callback
    checkpoint = model_set.save_model(kv.rank)
    # devices for training
    devs = mx.cpu() if args.gpus is None or args.gpus == '' else [
        mx.gpu(int(i)) for i in args.gpus.split(',')]
    # learning rate
    lr, lr_scheduler = get_lr_scheduler(args, kv)
    # load symbol
    network = model_set.get_model(args.model_name)
    # make module (the rough equivalent of a keras model)
    model = mx.mod.Module(context=devs, symbol=network, data_names=['data', 'tags'], label_names=['labels'])
    init = mx.initializer.Mixed(['bias', '.*'], [mx.init.Zero(), mx.init.Uniform(0.1)])
    # prepare optimizer params
    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler}
    # monitoring parameters of the network
    monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None
    # evaluation metrics built from the custom functions in common.py
    metric_f05 = lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=0.5)
    p, r = map(mx.metric.create, [precision, recall])
    metric_f05 = mx.metric.create(metric_f05)
    eval_metrics = [p, r, metric_f05]
    # callback that runs after each batch (a ProgressBar is an alternative)
    # batch_end_callback = mx.callback.ProgressBar(np.ceil(float(args.num_examples) / args.batch_size))
    batch_end_callback = mx.callback.Speedometer(args.batch_size, 50)
    # setup logger
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    # plot an image of the network structure
    mx.viz.plot_network(network).view()
    # call fit
    model.fit(train_data_iter,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.epoch,
              eval_data=valid_data_iter,
              eval_metric=eval_metrics,
              kvstore=kv,
              optimizer=args.optimizer,
              optimizer_params=optimizer_params,
              initializer=init,
              batch_end_callback=batch_end_callback,
              epoch_end_callback=checkpoint,
              allow_missing=True,
              monitor=monitor)
    # end
    print('training is completed!')

if __name__ == '__main__':
    train()