Potential Example for Lasagne/Recipes
#!/usr/bin/env python
"""
Lasagne example of ResNet-110 from 'Identity Mappings in Deep Residual Networks', Kaiming He et al. 2016 (https://arxiv.org/abs/1603.05027)
"""
from __future__ import print_function

import sys
import os
import time
import string
import random
import pickle

import numpy as np
import theano
import theano.tensor as T
import lasagne

# for the larger networks (n >= 9), we need to adjust Python's recursion limit
sys.setrecursionlimit(10000)

# ##################### Load data from CIFAR-10 dataset #######################
# this code assumes the cifar dataset from 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
# has been extracted in the current working directory


def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    d = cPickle.load(fo)
    fo.close()
    return d
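
# Note: cPickle exists only on Python 2. A rough Python 3 equivalent (an untested sketch,
# using the standard `pickle` module imported above) would be:
#     with open(file, 'rb') as fo:
#         d = pickle.load(fo, encoding='latin1')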

def load_data():
    xs = []
    ys = []
    for j in range(5):
        d = unpickle('cifar-10-batches-py/data_batch_' + str(j + 1))
        x = d['data']
        y = d['labels']
        xs.append(x)
        ys.append(y)

    d = unpickle('cifar-10-batches-py/test_batch')
    xs.append(d['data'])
    ys.append(d['labels'])

    x = np.concatenate(xs) / np.float32(255)
    y = np.concatenate(ys)
    x = np.dstack((x[:, :1024], x[:, 1024:2048], x[:, 2048:]))
    x = x.reshape((x.shape[0], 32, 32, 3)).transpose(0, 3, 1, 2)

    # subtract per-pixel mean (computed over the 50k training images only)
    pixel_mean = np.mean(x[0:50000], axis=0)
    #pickle.dump(pixel_mean, open("cifar10-pixel_mean.pkl","wb"))
    x -= pixel_mean

    # create mirrored (horizontally flipped) copies of the training images
    X_train = x[0:50000, :, :, :]
    Y_train = y[0:50000]
    X_train_flip = X_train[:, :, :, ::-1]
    Y_train_flip = Y_train
    X_train = np.concatenate((X_train, X_train_flip), axis=0)
    Y_train = np.concatenate((Y_train, Y_train_flip), axis=0)

    X_test = x[50000:, :, :, :]
    Y_test = y[50000:]

    return dict(
        X_train=lasagne.utils.floatX(X_train),
        Y_train=Y_train.astype('int32'),
        X_test=lasagne.utils.floatX(X_test),
        Y_test=Y_test.astype('int32'),)
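
# Rough sanity check of what load_data() returns (shapes follow from the code above):
#   X_train: (100000, 3, 32, 32) float32  -- 50k originals plus 50k horizontal flips
#   Y_train: (100000,)           int32
#   X_test:  (10000, 3, 32, 32)  float32
#   Y_test:  (10000,)            int32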

# ##################### Build the neural network model #######################
from lasagne.nonlinearities import rectify, softmax, sigmoid
from lasagne.layers import InputLayer, MaxPool2DLayer, DenseLayer, DropoutLayer, helper, batch_norm, BatchNormLayer
from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
from lasagne.layers import Pool2DLayer, ElemwiseSumLayer, NonlinearityLayer, PadLayer, GlobalPoolLayer, ExpressionLayer
from lasagne.init import Orthogonal, HeNormal, GlorotNormal

PIXELS = 32
imageSize = PIXELS * PIXELS
num_features = imageSize * 3

he_norm = HeNormal(gain='relu')

def ResNet_FullPreActivation(input_var=None, n=18):
    '''
    Adapted from https://github.com/Lasagne/Recipes/tree/master/papers/deep_residual_learning.
    Tweaked to be consistent with 'Identity Mappings in Deep Residual Networks', Kaiming He et al. 2016 (https://arxiv.org/abs/1603.05027)

    Formula to figure out depth: 6n + 2
    '''
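    # Depth check: 3 stacks of n residual blocks, 2 conv layers per block, plus the first
    # conv layer and the final classification layer -> 6n + 2 layers in total.
    # e.g. n = 18 gives 6*18 + 2 = 110 (ResNet-110), n = 5 gives 32 (ResNet-32).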
    # create a residual learning building block with two stacked 3x3 conv layers as in the paper
    def residual_block(l, increase_dim=False, first=False):
        input_num_filters = l.output_shape[1]

        if increase_dim:
            first_stride = (2, 2)
            out_num_filters = input_num_filters * 2
        else:
            first_stride = (1, 1)
            out_num_filters = input_num_filters

        if first:
            # hacky solution to keep layers correct
            bn_pre_relu = l
        else:
            # contains the BN -> ReLU portion, steps 1 to 2
            bn_pre_conv = BatchNormLayer(l)
            bn_pre_relu = NonlinearityLayer(bn_pre_conv, rectify)

        # contains the weight -> BN -> ReLU portion, steps 3 to 5
        conv_1 = batch_norm(ConvLayer(bn_pre_relu, num_filters=out_num_filters, filter_size=(3,3), stride=first_stride, nonlinearity=rectify, pad='same', W=he_norm))

        # contains the last weight portion, step 6
        conv_2 = ConvLayer(conv_1, num_filters=out_num_filters, filter_size=(3,3), stride=(1,1), nonlinearity=None, pad='same', W=he_norm)

        # add shortcut connections
        if increase_dim:
            # projection shortcut, as option B in the paper
            projection = ConvLayer(l, num_filters=out_num_filters, filter_size=(1,1), stride=(2,2), nonlinearity=None, pad='same', b=None)
            block = ElemwiseSumLayer([conv_2, projection])
        else:
            block = ElemwiseSumLayer([conv_2, l])

        return block
    # Building the network
    l_in = InputLayer(shape=(None, 3, PIXELS, PIXELS), input_var=input_var)

    # first layer, output is 16 x 32 x 32
    l = batch_norm(ConvLayer(l_in, num_filters=16, filter_size=(3,3), stride=(1,1), nonlinearity=rectify, pad='same', W=he_norm))

    # first stack of residual blocks, output is 16 x 32 x 32
    l = residual_block(l, first=True)
    for _ in range(1, n):
        l = residual_block(l)

    # second stack of residual blocks, output is 32 x 16 x 16
    l = residual_block(l, increase_dim=True)
    for _ in range(1, n):
        l = residual_block(l)

    # third stack of residual blocks, output is 64 x 8 x 8
    l = residual_block(l, increase_dim=True)
    for _ in range(1, n):
        l = residual_block(l)

    bn_post_conv = BatchNormLayer(l)
    bn_post_relu = NonlinearityLayer(bn_post_conv, rectify)

    # average pooling
    avg_pool = GlobalPoolLayer(bn_post_relu)

    # fully connected layer
    network = DenseLayer(avg_pool, num_units=10, W=HeNormal(), nonlinearity=softmax)

    return network
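
# Minimal usage sketch for the builder above (an assumption about how one might call it in
# isolation; main() below does the full training run). Requires Theano + Lasagne with cuDNN,
# since ConvLayer is Conv2DDNNLayer:
#     input_var = T.tensor4('inputs')
#     network = ResNet_FullPreActivation(input_var, n=18)
#     prediction = lasagne.layers.get_output(network, deterministic=True)
#     predict_fn = theano.function([input_var], prediction)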

# ############################# Batch iterator ###############################

def iterate_minibatches(inputs, targets, batchsize, shuffle=False, augment=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        if augment:
            # as in paper:
            # pad feature arrays with 4 pixels on each side (32x32 -> 40x40)
            # and do random cropping of 32x32, so the crop offsets range over 0..8
            padded = np.pad(inputs[excerpt], ((0,0), (0,0), (4,4), (4,4)), mode='constant')
            random_cropped = np.zeros(inputs[excerpt].shape, dtype=np.float32)
            crops = np.random.random_integers(0, high=8, size=(batchsize, 2))
            for r in range(batchsize):
                random_cropped[r, :, :, :] = padded[r, :, crops[r,0]:(crops[r,0]+32), crops[r,1]:(crops[r,1]+32)]
            inp_exc = random_cropped
        else:
            inp_exc = inputs[excerpt]
        yield inp_exc, targets[excerpt]
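
# Note: batches are taken in steps of `batchsize`, so a final partial batch is silently dropped.
# With the 100,000 augmented training images and batchsize=128 this yields 781 updates per epoch,
# which is why ~32k and ~48k iterations correspond to roughly epochs 41 and 61 below.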

# ############################## Main program ################################

def main(n=18, num_epochs=82, model=None):
    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print("CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'.")
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (n residual blocks per stack; n=18 gives ResNet-110)
    print("Building model and compiling functions...")
    network = ResNet_FullPreActivation(input_var, n=n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network, trainable=True))

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()

        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        lr = 0.1
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(
            loss, params, learning_rate=sh_lr, momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    if model is None:
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # shuffle training data
            train_indices = np.arange(X_train.shape[0])
            np.random.shuffle(train_indices)
            X_train = X_train[train_indices, :, :, :]
            Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(X_train, Y_train, 128, shuffle=True, augment=True):
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                train_batches += 1

            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
                inputs, targets = batch
                err, acc = val_fn(inputs, targets)
                val_err += err
                val_acc += acc
                val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
            print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
            print(" validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))

            # adjust learning rate as in paper (divide by 10 after 32k and 48k iterations;
            # at ~781 updates per epoch these are roughly epochs 41 and 61)
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # dump the network weights to a file:
        np.savez('cifar10_deep_residual_model.npz', *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    # Calculate validation error of model:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print(" test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))

if __name__ == '__main__':
    if ('--help' in sys.argv) or ('-h' in sys.argv):
        print("Trains a Deep Residual Learning network on cifar-10 using Lasagne.")
        print("Network architecture and training parameters are as in section 4.2 of 'Deep Residual Learning for Image Recognition'.")
        print("Usage: %s [N [MODEL]]" % sys.argv[0])
        print()
        print("N: Number of stacked residual building blocks per feature map (default: 18, i.e. ResNet-110)")
        print("MODEL: saved model file to load (for validation) (default: None)")
    else:
        kwargs = {}
        if len(sys.argv) > 1:
            kwargs['n'] = int(sys.argv[1])
        if len(sys.argv) > 2:
            kwargs['model'] = sys.argv[2]
        main(**kwargs)
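
# Usage examples (hypothetical file name; substitute whatever this script is saved as):
#     python cifar10_resnet.py            # train ResNet-110 (n=18) from scratch
#     python cifar10_resnet.py 5          # train the shallower ResNet-32 (6*5 + 2 layers)
#     python cifar10_resnet.py 18 cifar10_deep_residual_model.npz   # evaluate saved weights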