""" | |
A deep neural network with or w/o dropout in one file. | |
""" | |
import numpy, theano, sys, math | |
from theano import tensor as T | |
from theano import shared | |
from theano.tensor.shared_randomstreams import RandomStreams | |
from collections import OrderedDict | |
BATCH_SIZE = 100 | |


def relu_f(vec):
    """ Wrapper to quickly change the rectified linear unit function.

    (vec + |vec|) / 2 is equivalent to an element-wise max(0, vec). """
    return (vec + abs(vec)) / 2.


def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.binomial(n=1, p=1.-p, size=x.shape,
                             dtype=theano.config.floatX)
        return x * mask
    return x
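

# Illustrative sketch (not part of the original gist): the same Bernoulli
# masking idea as dropout() above, written with plain numpy so it can be
# inspected outside of the Theano graph. The helper name _demo_numpy_dropout
# and its defaults are assumptions made here for illustration only.
def _demo_numpy_dropout(x, p=0.5, rng=None):
    """ Zero-out entries of a numpy array x with probability p. """
    rng = rng if rng is not None else numpy.random.RandomState(0)
    if 0. < p < 1.:
        mask = rng.binomial(n=1, p=1. - p, size=x.shape).astype(x.dtype)
        return x * mask
    return x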


def fast_dropout(rng, x):
    """ Multiply activations by N(1, 1) Gaussian noise ("fast dropout",
    Wang & Manning 2013) instead of zeroing them out. """
    seed = rng.randint(2 ** 30)
    srng = RandomStreams(seed)
    mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX)
    return x * mask
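

# Illustrative numpy sketch (an assumption added here, mirroring fast_dropout
# above, with a hypothetical helper name): multiplicative Gaussian noise with
# mean 1 keeps the expected activation unchanged, E[x * N(1, 1)] = x.
def _demo_numpy_fast_dropout(x, rng=None):
    """ Multiply a numpy array x by N(1, 1) noise, element-wise. """
    rng = rng if rng is not None else numpy.random.RandomState(0)
    return x * rng.normal(loc=1., scale=1., size=x.shape)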


def build_shared_zeros(shape, name):
    """ Builds a theano shared variable filled with a zeros numpy array """
    return shared(value=numpy.zeros(shape, dtype=theano.config.floatX),
                  name=name, borrow=True)


class Linear(object):
    """ Basic linear transformation layer (W.X + b) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        if W is None:
            # Glorot & Bengio (2010) uniform initialization
            W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
            W_values *= 4  # this scaling works for sigmoid-activated networks
            W = theano.shared(value=W_values, name='W', borrow=True)
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        self.input = input
        self.W = W
        self.b = b
        self.params = [self.W, self.b]
        self.output = T.dot(self.input, self.W) + self.b
        if fdrop:
            self.output = fast_dropout(rng, self.output)

    def __repr__(self):
        return "Linear"


class SigmoidLayer(Linear):
    """ Sigmoid activation layer (sigmoid(W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        if fdrop:
            self.pre_activation = fast_dropout(rng, self.pre_activation)
        self.output = T.nnet.sigmoid(self.pre_activation)


class ReLU(Linear):
    """ Rectified Linear Unit activation layer (max(0, W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        if fdrop:
            self.pre_activation = fast_dropout(rng, self.pre_activation)
        self.output = relu_f(self.pre_activation)


class DatasetMiniBatchIterator(object):
    """ Basic mini-batch iterator, yielding (x_batch, y_batch) pairs """
    def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.randomize = randomize
        from sklearn.utils import check_random_state
        self.rng = check_random_state(42)

    def __iter__(self):
        n_samples = self.x.shape[0]
        n_batches = (n_samples + self.batch_size - 1) / self.batch_size
        if self.randomize:
            # sample minibatch indices at random (with replacement)
            for _ in xrange(n_samples / self.batch_size):
                if self.batch_size > 1:
                    i = int(self.rng.rand(1) * n_batches)
                else:
                    i = int(math.floor(self.rng.rand(1) * n_samples))
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
        else:
            for i in xrange(n_batches):
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
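

# Usage sketch (an assumption, not executed): how the iterator above is
# consumed by fit() further below, on hypothetical toy arrays x_toy / y_toy.
#   x_toy = numpy.random.rand(1000, 120).astype('float32')
#   y_toy = numpy.random.randint(0, 10, size=1000).astype('int32')
#   for bx, by in DatasetMiniBatchIterator(x_toy, y_toy, batch_size=100):
#       pass  # bx has shape (100, 120) and by has shape (100,)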


class LogisticRegression(object):
    """ Multi-class Logistic Regression
    """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W is not None:
            self.W = W
        else:
            self.W = build_shared_zeros((n_in, n_out), 'W')
        if b is not None:
            self.b = b
        else:
            self.b = build_shared_zeros((n_out,), 'b')
        # P(Y|X) = softmax(W.X + b)
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.output = self.y_pred
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def negative_log_likelihood_sum(self, y):
        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def training_cost(self, y):
        """ Wrapper for standard name """
        return self.negative_log_likelihood_sum(y)

    def errors(self, y):
        if y.ndim != self.y_pred.ndim:
            raise TypeError("y should have the same shape as self.y_pred",
                            ("y", y.type, "y_pred", self.y_pred.type))
        if y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred, y))
        else:
            print("!!! y should be of int type")
            return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))
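

# Illustrative numpy sketch (an assumption, mirroring
# LogisticRegression.negative_log_likelihood above): the mean negative
# log-likelihood of integer labels y under a (n_samples, n_classes) matrix
# of predicted probabilities.
def _demo_numpy_nll(p_y_given_x, y):
    return -numpy.mean(numpy.log(p_y_given_x)[numpy.arange(y.shape[0]), y])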


class NeuralNet(object):
    """ Neural network (not regularized, without dropout) """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=40*3,
                 layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024, 1024],
                 n_outs=62 * 3,
                 rho=0.9,
                 eps=1.E-6,
                 max_norm=0.,
                 debugprint=False):
        """
        Basic feedforward neural network.
        """
        self.layers = []
        self.params = []
        self.n_layers = len(layers_types)
        self.layers_types = layers_types
        assert self.n_layers > 0
        self.max_norm = max_norm
        self._rho = rho  # "momentum" for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        self.x = T.fmatrix('x')
        self.y = T.ivector('y')
        self.layers_ins = [n_ins] + layers_sizes
        self.layers_outs = layers_sizes + [n_outs]
        layer_input = self.x
        for layer_type, n_in, n_out in zip(layers_types,
                                           self.layers_ins, self.layers_outs):
            this_layer = layer_type(rng=numpy_rng,
                                    input=layer_input, n_in=n_in, n_out=n_out)
            assert hasattr(this_layer, 'output')
            self.params.extend(this_layer.params)
            self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                'accugrad') for t in this_layer.params])
            self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                'accudelta') for t in this_layer.params])
            self.layers.append(this_layer)
            layer_input = this_layer.output
        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # TODO standardize cost
        self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
        self.cost = self.layers[-1].training_cost(self.y)
        if debugprint:
            theano.printing.debugprint(self.cost)
        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                    zip(self.layers_ins, self.layers_outs))
        return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                            zip(self.layers_types, dimensions_layers_str)))

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        # using mean_cost so that the learning rate is not too dependent
        # on the batch size
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            if self.max_norm:
                W = param - gparam * learning_rate
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param - gparam * learning_rate
        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn
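
    # Illustrative numpy sketch (an assumption added here, mirroring the
    # max_norm branch above; the helper name is hypothetical): rescale any
    # column of W whose L2 norm exceeds max_norm back onto that radius.
    @staticmethod
    def _demo_numpy_max_norm(W, max_norm=4., eps=1e-6):
        col_norms = numpy.sqrt((W ** 2).sum(axis=0))
        desired_norms = numpy.clip(col_norms, 0, max_norm)
        return W * (desired_norms / (eps + col_norms))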

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
            # accumulate squared gradients and scale the step by their root
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn
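
    # Illustrative numpy sketch (an assumption, mirroring the Adagrad update
    # above; the helper name is hypothetical): accumulate squared gradients
    # and divide the learning rate by their square root.
    @staticmethod
    def _demo_numpy_adagrad_step(param, gparam, accugrad, lr=1e-2, eps=1e-6):
        agrad = accugrad + gparam * gparam
        dx = - (lr / numpy.sqrt(agrad + eps)) * gparam
        return param + dx, agrad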

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
        self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * gparam
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn
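
    # Illustrative numpy sketch (an assumption, mirroring Algorithm 1 of
    # Zeiler 2012 as implemented above; the helper name is hypothetical):
    # accumulate squared gradients with decay rho, scale the step by the
    # ratio of the two running RMS values, then accumulate squared updates.
    @staticmethod
    def _demo_numpy_adadelta_step(param, gparam, accugrad, accudelta,
                                  rho=0.9, eps=1e-6):
        agrad = rho * accugrad + (1 - rho) * gparam * gparam
        dx = - numpy.sqrt((accudelta + eps) / (agrad + eps)) * gparam
        accudelta = rho * accudelta + (1 - rho) * dx * dx
        return param + dx, agrad, accudelta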

    def score_classif(self, given_set):
        """ Returns functions to get current classification errors. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x),
                                        theano.Param(batch_y)],
                                outputs=self.errors,
                                givens={self.x: batch_x, self.y: batch_y})

        def scoref():
            """ returned function that scans the entire set given as input """
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref


class RegularizedNet(NeuralNet):
    """ Neural net with L1 and L2 regularization """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=100,
                 layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024],
                 n_outs=2,
                 rho=0.9,
                 eps=1.E-6,
                 L1_reg=0.,
                 L2_reg=0.,
                 max_norm=0.,
                 debugprint=False):
        """
        Feedforward neural network with added L1 and/or L2 regularization.
        """
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                debugprint)
        L1 = shared(0.)
        for param in self.params:
            L1 += T.sum(abs(param))
        if L1_reg > 0.:
            self.cost = self.cost + L1_reg * L1
        L2 = shared(0.)
        for param in self.params:
            L2 += T.sum(param ** 2)
        if L2_reg > 0.:
            self.cost = self.cost + L2_reg * L2
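

# Illustrative numpy sketch (an assumption, mirroring the penalties added to
# self.cost in RegularizedNet above; the helper name is hypothetical): the L1
# and L2 terms for a list of numpy parameter arrays.
def _demo_numpy_l1_l2(params, L1_reg=0., L2_reg=0.):
    L1 = sum(numpy.abs(p).sum() for p in params)
    L2 = sum((p ** 2).sum() for p in params)
    return L1_reg * L1 + L2_reg * L2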


class DropoutNet(NeuralNet):
    """ Neural net with dropout (see Hinton et al.'s dropout paper) """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=40*3,
                 layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[4000, 4000, 4000, 4000],
                 dropout_rates=[0.0, 0.5, 0.5, 0.5, 0.5],
                 n_outs=62 * 3,
                 rho=0.9,
                 eps=1.E-6,
                 max_norm=0.,
                 fast_drop=False,
                 debugprint=False):
        """
        Feedforward neural network with dropout regularization.
        """
        super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                debugprint)
        self.dropout_rates = dropout_rates
        if fast_drop:
            if dropout_rates[0]:
                dropout_layer_input = fast_dropout(numpy_rng, self.x)
            else:
                dropout_layer_input = self.x
        else:
            dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
        self.dropout_layers = []
        for layer, layer_type, n_in, n_out, dr in zip(self.layers,
                layers_types, self.layers_ins, self.layers_outs,
                dropout_rates[1:] + [0]):  # !!! we do not drop anything
                                           # out of the last layer !!!
            if dr:
                if fast_drop:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W, b=layer.b, fdrop=True)
                else:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W * 1. / (1. - dr),
                            b=layer.b * 1. / (1. - dr))
                    # N.B. dropout with dr == 1 does not drop anything!!
                    this_layer.output = dropout(numpy_rng, this_layer.output, dr)
            else:
                this_layer = layer_type(rng=numpy_rng,
                        input=dropout_layer_input, n_in=n_in, n_out=n_out,
                        W=layer.W, b=layer.b)
            assert hasattr(this_layer, 'output')
            self.dropout_layers.append(this_layer)
            dropout_layer_input = this_layer.output
        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # these are the dropout costs
        self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
        self.cost = self.dropout_layers[-1].training_cost(self.y)
        # these are the non-dropout errors
        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        return super(DropoutNet, self).__repr__() + "\n"\
               + "dropout rates: " + str(self.dropout_rates)


def add_fit_and_score(class_to_chg):
    """ Mutates a class to add the fit() and score() functions to a NeuralNet.
    """
    from types import MethodType

    def fit(self, x_train, y_train, x_dev=None, y_dev=None,
            max_epochs=100, early_stopping=True, split_ratio=0.1,
            method='adadelta', verbose=False, plot=False):
        """
        Fits the neural network to `x_train` and `y_train`.
        If `x_dev` or `y_dev` is not given, a `split_ratio` validation
        split of `x_train` and `y_train` is used (for early stopping).
        """
        import time, copy
        if x_dev is None or y_dev is None:
            from sklearn.cross_validation import train_test_split
            x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
                    test_size=split_ratio, random_state=42)
        if method == 'sgd':
            train_fn = self.get_SGD_trainer()
        elif method == 'adagrad':
            train_fn = self.get_adagrad_trainer()
        elif method == 'adadelta':
            train_fn = self.get_adadelta_trainer()
        train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
        dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
        train_scoref = self.score_classif(train_set_iterator)
        dev_scoref = self.score_classif(dev_set_iterator)
        best_dev_loss = numpy.inf
        epoch = 0
        # TODO early stopping (not just cross val, also stop training)
        if plot:
            verbose = True
            self._costs = []
            self._train_errors = []
            self._dev_errors = []
            self._updates = []
        while epoch < max_epochs:
            if not verbose:
                sys.stdout.write("\r%0.2f%%" % (epoch * 100. / max_epochs))
                sys.stdout.flush()
            avg_costs = []
            timer = time.time()
            for x, y in train_set_iterator:
                if method == 'sgd' or method == 'adagrad':
                    # TODO: you have to play with this learning rate
                    # (it is dataset dependent)
                    avg_cost = train_fn(x, y, lr=1.E-2)
                elif method == 'adadelta':
                    avg_cost = train_fn(x, y)
                if type(avg_cost) == list:
                    avg_costs.append(avg_cost[0])
                else:
                    avg_costs.append(avg_cost)
            if verbose:
                mean_costs = numpy.mean(avg_costs)
                mean_train_errors = numpy.mean(train_scoref())
                print(' epoch %i took %f seconds' %
                      (epoch, time.time() - timer))
                print(' epoch %i, avg costs %f' %
                      (epoch, mean_costs))
                print(' epoch %i, training error %f' %
                      (epoch, mean_train_errors))
                if plot:
                    self._costs.append(mean_costs)
                    self._train_errors.append(mean_train_errors)
            dev_errors = numpy.mean(dev_scoref())
            if plot:
                self._dev_errors.append(dev_errors)
            if dev_errors < best_dev_loss:
                best_dev_loss = dev_errors
                best_params = copy.deepcopy(self.params)
                if verbose:
                    print('!!! epoch %i, validation error of best model %f' %
                          (epoch, dev_errors))
            epoch += 1
        if not verbose:
            print("")
        for i, param in enumerate(best_params):
            self.params[i] = param

    def score(self, x, y):
        """ error rates """
        iterator = DatasetMiniBatchIterator(x, y)
        scoref = self.score_classif(iterator)
        return numpy.mean(scoref())

    class_to_chg.fit = MethodType(fit, None, class_to_chg)
    class_to_chg.score = MethodType(score, None, class_to_chg)
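

# Usage sketch (an assumption, mirroring the __main__ block below, with
# hypothetical x_train / y_train / x_test / y_test arrays):
#   add_fit_and_score(RegularizedNet)
#   net = RegularizedNet(numpy_rng=numpy.random.RandomState(123), n_ins=100,
#                        layers_types=[ReLU, LogisticRegression],
#                        layers_sizes=[200], n_outs=10,
#                        L2_reg=1. / x_train.shape[0])
#   net.fit(x_train, y_train, max_epochs=10, method='adadelta')
#   print("test error: %f" % net.score(x_test, y_test))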
if __name__ == "__main__": | |
add_fit_and_score(DropoutNet) | |
add_fit_and_score(RegularizedNet) | |
def nudge_dataset(X, Y): | |
""" | |
This produces a dataset 5 times bigger than the original one, | |
by moving the 8x8 images in X around by 1px to left, right, down, up | |
""" | |
from scipy.ndimage import convolve | |
direction_vectors = [ | |
[[0, 1, 0], | |
[0, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[1, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 1], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 0], | |
[0, 1, 0]]] | |
shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', | |
weights=w).ravel() | |
X = numpy.concatenate([X] + | |
[numpy.apply_along_axis(shift, 1, X, vector) | |
for vector in direction_vectors]) | |
Y = numpy.concatenate([Y for _ in range(5)], axis=0) | |
return X, Y | |
from sklearn import datasets, svm, naive_bayes | |
from sklearn import cross_validation, preprocessing | |
MNIST = True # MNIST dataset | |
DIGITS = False # digits dataset | |
FACES = True # faces dataset | |
TWENTYNEWSGROUPS = False # 20 newgroups dataset | |
VERBOSE = True # prints evolution of the loss/accuracy during the fitting | |
SCALE = True # scale the dataset | |
PLOT = True # plot losses and accuracies | |
def train_models(x_train, y_train, x_test, y_test, n_features, n_outs, | |
use_dropout=True, n_epochs=100, numpy_rng=None, | |
svms=False, nb=False, deepnn=True, name=''): | |
if svms: | |
print("Linear SVM") | |
classifier = svm.SVC(gamma=0.001) | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
print("RBF-kernel SVM") | |
classifier = svm.SVC(kernel='rbf', class_weight='auto') | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if nb: | |
print("Multinomial Naive Bayes") | |
classifier = naive_bayes.MultinomialNB() | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if deepnn: | |
import warnings | |
warnings.filterwarnings("ignore") # TODO remove | |
if use_dropout: | |
#n_epochs *= 4 TODO | |
pass | |
def new_dnn(dropout=False): | |
if dropout: | |
print("Dropout DNN") | |
return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features, | |
layers_types=[ReLU, ReLU, LogisticRegression], | |
layers_sizes=[200, 200], | |
dropout_rates=[0., 0.5, 0.5], | |
# TODO if you have a big enough GPU, use these: | |
#layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], | |
#layers_sizes=[2000, 2000, 2000, 2000], | |
#dropout_rates=[0., 0.5, 0.5, 0.5, 0.5], | |
n_outs=n_outs, | |
max_norm=4., | |
fast_drop=True, | |
debugprint=0) | |
else: | |
print("Simple (regularized) DNN") | |
return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features, | |
layers_types=[ReLU, ReLU, LogisticRegression], | |
layers_sizes=[200, 200], | |
n_outs=n_outs, | |
#L1_reg=0.001/x_train.shape[0], | |
#L2_reg=0.001/x_train.shape[0], | |
L1_reg=0., | |
L2_reg=1./x_train.shape[0], | |
debugprint=0) | |
import matplotlib.pyplot as plt | |
plt.figure() | |
ax1 = plt.subplot(221) | |
ax2 = plt.subplot(222) | |
ax3 = plt.subplot(223) | |
ax4 = plt.subplot(224) # TODO plot the updates of the weights | |
methods = ['sgd', 'adagrad', 'adadelta'] | |
#methods = ['adadelta'] TODO if you want "good" results asap | |
for method in methods: | |
dnn = new_dnn(use_dropout) | |
print dnn, "using", method | |
dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT) | |
test_error = dnn.score(x_test, y_test) | |
print("score: %f" % (1. - test_error)) | |
ax1.plot(numpy.log10(dnn._costs), label=method) | |
ax2.plot(numpy.log10(dnn._train_errors), label=method) | |
ax3.plot(numpy.log10(dnn._dev_errors), label=method) | |
#ax2.plot(dnn._train_errors, label=method) | |
#ax3.plot(dnn._dev_errors, label=method) | |
ax4.plot([test_error for _ in range(10)], label=method) | |
ax1.set_xlabel('epoch') | |
ax1.set_ylabel('cost (log10)') | |
ax2.set_xlabel('epoch') | |
ax2.set_ylabel('train error') | |
ax3.set_xlabel('epoch') | |
ax3.set_ylabel('dev error') | |
ax4.set_ylabel('test error') | |
plt.legend() | |
plt.savefig('training_' + name + '.png') | |
if MNIST: | |
from sklearn.datasets import fetch_mldata | |
mnist = fetch_mldata('MNIST original') | |
X = numpy.asarray(mnist.data, dtype='float32') | |
if SCALE: | |
#X = preprocessing.scale(X) | |
X /= 255. | |
y = numpy.asarray(mnist.target, dtype='int32') | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % len(set(y))) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='MNIST') | |
if DIGITS: | |
digits = datasets.load_digits() | |
data = numpy.asarray(digits.data, dtype='float32') | |
target = numpy.asarray(digits.target, dtype='int32') | |
nudged_x, nudged_y = nudge_dataset(data, target) | |
if SCALE: | |
nudged_x = preprocessing.scale(nudged_x) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
nudged_x, nudged_y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1], | |
len(set(target)), numpy_rng=numpy.random.RandomState(123), | |
name='digits') | |
if FACES: | |
import logging | |
logging.basicConfig(level=logging.INFO, | |
format='%(asctime)s %(message)s') | |
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, | |
resize=0.4) | |
X = numpy.asarray(lfw_people.data, dtype='float32') | |
if SCALE: | |
X = preprocessing.scale(X) | |
y = numpy.asarray(lfw_people.target, dtype='int32') | |
target_names = lfw_people.target_names | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % target_names.shape[0]) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='faces') | |
if TWENTYNEWSGROUPS: | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
newsgroups_train = datasets.fetch_20newsgroups(subset='train') | |
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000) | |
#vectorizer = HashingVectorizer(encoding='latin-1') | |
x_train = vectorizer.fit_transform(newsgroups_train.data) | |
x_train = numpy.asarray(x_train.todense(), dtype='float32') | |
y_train = numpy.asarray(newsgroups_train.target, dtype='int32') | |
newsgroups_test = datasets.fetch_20newsgroups(subset='test') | |
x_test = vectorizer.transform(newsgroups_test.data) | |
x_test = numpy.asarray(x_test.todense(), dtype='float32') | |
y_test = numpy.asarray(newsgroups_test.target, dtype='int32') | |
train_models(x_train, y_train, x_test, y_test, x_train.shape[1], | |
len(set(y_train)), | |
numpy_rng=numpy.random.RandomState(123), | |
svms=False, nb=True, deepnn=True, | |
name='20newsgroups') |