Manual Prático do Deep Learning - Rede Neural
import numpy as np
import _pickle as pkl

# activation functions
def linear(x, derivative=False):
    return np.ones_like(x) if derivative else x

def sigmoid(x, derivative=False):
    if derivative:
        y = sigmoid(x)
        return y*(1 - y)
    return 1.0/(1.0 + np.exp(-x))

def tanh(x, derivative=False):
    if derivative:
        y = tanh(x)
        return 1 - y**2
    return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))

def relu(x, derivative=False):
    if derivative:
        return np.where(x <= 0, 0, 1)
    return np.maximum(0, x)

def leaky_relu(x, derivative=False):
    alpha = 0.1
    if derivative:
        return np.where(x <= 0, alpha, 1)
    return np.where(x <= 0, alpha*x, x)

def elu(x, derivative=False):
    alpha = 1.0
    if derivative:
        y = elu(x)
        return np.where(x <= 0, y + alpha, 1)
    return np.where(x <= 0, alpha*(np.exp(x) - 1), x)
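# Quick illustrative check: each activation and its derivative applied to a small
# demo vector (_z is just a throwaway value), to show the expected elementwise behavior.
_z = np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])
print("relu    :", relu(_z))
print("relu'   :", relu(_z, derivative=True))
print("sigmoid':", sigmoid(_z, derivative=True))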
# other functions
def softmax(x, y_oh=None, derivative=False):
    if derivative:
        y_pred = softmax(x)
        y_correct = np.argmax(y_oh, axis=1)
        pk = y_pred[range(y_pred.shape[0]), y_correct]
        y_pred[range(y_pred.shape[0]), y_correct] = pk*(1.0 - pk)
        return y_pred
    exp = np.exp(x)
    return exp/np.sum(exp, axis=1, keepdims=True)

def neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_pred[range(y_pred.shape[0]), y_correct]
    if derivative:
        y_pred[range(y_pred.shape[0]), y_correct] = (-1.0/pk)
        return y_pred
    return np.mean(-np.log(pk))
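# Quick illustrative check: each row of softmax is a valid probability distribution
# (non-negative entries that sum to 1), even for an all-zero logit row.
_logits = np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])
print("softmax row sums:", np.sum(softmax(_logits), axis=1))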
# cost functions
def mae(y, y_pred, derivative=False):
    if derivative:
        return np.where(y_pred > y, 1, -1) / y.shape[0]
    return np.mean(np.abs(y - y_pred))

def mse(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / y.shape[0]
    return 0.5*np.mean((y - y_pred)**2)

def binary_cross_entropy(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / (y_pred * (1-y_pred) * y.shape[0])
    return -np.mean(y*np.log(y_pred) + (1-y)*np.log(1-y_pred))

def sigmoid_cross_entropy(y, y_pred, derivative=False):
    y_sigmoid = sigmoid(y_pred)
    if derivative:
        return -(y - y_sigmoid) / y.shape[0]
    return -np.mean(y*np.log(y_sigmoid) + (1-y)*np.log(1-y_sigmoid))

def softmax_neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_softmax = softmax(y_pred)
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_softmax[range(y_softmax.shape[0]), y_correct]
    if derivative:
        return -(y_oh - y_softmax)/y_oh.shape[0]
    return np.mean(-np.log(pk))
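# Quick illustrative check: sigmoid_cross_entropy applied to raw logits should match
# binary_cross_entropy applied to sigmoid(logits) (_y_true/_logits are throwaway demo values).
_y_true = np.array([[1.0], [0.0], [1.0]])
_logits = np.array([[2.0], [-1.0], [0.5]])
print(sigmoid_cross_entropy(_y_true, _logits),
      binary_cross_entropy(_y_true, sigmoid(_logits)))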
# weights initialization
def zeros(rows, cols):
    return np.zeros((rows, cols))

def ones(rows, cols):
    return np.ones((rows, cols))

def random_normal(rows, cols):
    return np.random.randn(rows, cols)

def random_uniform(rows, cols):
    return np.random.rand(rows, cols)

def glorot_normal(rows, cols):
    # normal with mean=0 and stddev=sqrt(2.0 / (fan_out + fan_in)). See the notes of np.random.randn.
    std_dev = np.sqrt(2.0 / (rows + cols))
    return std_dev*np.random.randn(rows, cols)

def glorot_uniform(rows, cols):
    # uniform over [-limit, limit], where limit = np.sqrt(6.0 / (fan_out + fan_in))
    limit = np.sqrt(6.0 / (rows + cols))
    return 2*limit*np.random.rand(rows, cols) - limit
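# Quick illustrative check: for a reasonably large matrix, the empirical standard
# deviation of glorot_normal should be close to sqrt(2 / (rows + cols)).
_w = glorot_normal(200, 300)
print("glorot_normal std: %.4f  expected: %.4f" % (_w.std(), np.sqrt(2.0 / (200 + 300))))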
# regularization
def l1_regularization(weights, derivative=False):
    if derivative:
        weights = [np.where(w < 0, -1, w) for w in weights]
        return np.array([np.where(w > 0, 1, w) for w in weights])
    return np.sum([np.sum(np.abs(w)) for w in weights])

def l2_regularization(weights, derivative=False):
    if derivative:
        return weights
    return 0.5 * np.sum(weights**2)
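# Quick illustrative check on a toy weight matrix: the L2 penalty is 0.5*sum(w^2)
# and its gradient is simply w; the L1 penalty is sum(|w|).
_w = np.array([[1.0, -2.0], [0.5, 0.0]])
print("l2 penalty :", l2_regularization(_w))                   # 0.5*(1 + 4 + 0.25 + 0) = 2.625
print("l2 gradient:", l2_regularization(_w, derivative=True))  # just w
print("l1 penalty :", l1_regularization(_w))                   # 1 + 2 + 0.5 + 0 = 3.5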
# batch generator
def batch_sequential(x, y, batch_size=None):
    batch_size = x.shape[0] if batch_size is None else batch_size
    n_batches = x.shape[0] // batch_size
    for batch in range(n_batches):
        offset = batch_size * batch
        x_batch, y_batch = x[offset:offset+batch_size], y[offset:offset+batch_size]
        yield (x_batch, y_batch)

def batch_shuffle(x, y, batch_size=None):
    shuffle_index = np.random.permutation(range(x.shape[0]))
    return batch_sequential(x[shuffle_index], y[shuffle_index], batch_size)
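# Quick illustrative check: batch_shuffle yields mini-batches of the shuffled data.
# Note that the sequential generator drops the remainder (here 10 % 4 = 2 samples).
_xb = np.arange(10).reshape(10, 1)
_yb = _xb * 2
for _bx, _by in batch_shuffle(_xb, _yb, batch_size=4):
    print("batch shapes:", _bx.shape, _by.shape)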
# learning rate decay
def none_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate

def time_based_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate / (1 + decay_rate * epoch)

def exponential_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**epoch

def staircase_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**(epoch // decay_steps)
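# Quick illustrative check: with decay_rate=0.5 and decay_steps=10, the staircase
# schedule halves the learning rate every 10 epochs.
for _epoch in [0, 5, 10, 15, 20]:
    print("epoch", _epoch, "lr:", staircase_decay(0.1, _epoch, decay_rate=0.5, decay_steps=10))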
# batch normalization
def batchnorm_forward(layer, x, is_training=True):
    mu = np.mean(x, axis=0) if is_training else layer._pop_mean
    var = np.var(x, axis=0) if is_training else layer._pop_var
    x_norm = (x - mu) / np.sqrt(var + 1e-8)
    out = layer.gamma * x_norm + layer.beta
    if is_training:
        layer._pop_mean = layer.bn_decay * layer._pop_mean + (1.0-layer.bn_decay)*mu
        layer._pop_var = layer.bn_decay * layer._pop_var + (1.0-layer.bn_decay)*var
        layer._bn_cache = (x, x_norm, mu, var)
    return out

def batchnorm_backward(layer, dactivation):
    x, x_norm, mu, var = layer._bn_cache
    m = layer._activ_inp.shape[0]
    x_mu = x - mu
    std_inv = 1. / np.sqrt(var + 1e-8)
    dx_norm = dactivation * layer.gamma
    dvar = np.sum(dx_norm * x_mu, axis=0) * -0.5 * (std_inv**3)
    dmu = np.sum(dx_norm * -std_inv, axis=0) + dvar * np.mean(-2.0 * x_mu, axis=0)
    dx = (dx_norm * std_inv) + (dvar * 2.0 * x_mu / m) + (dmu / m)
    layer._dgamma = np.sum(dactivation * x_norm, axis=0)
    layer._dbeta = np.sum(dactivation, axis=0)
    return dx
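# Quick illustrative check (assumption: a SimpleNamespace stand-in is enough here,
# since the real Layer class is only defined further below). With gamma=1 and beta=0,
# batchnorm_forward should output roughly zero mean and unit variance per feature.
from types import SimpleNamespace
_bn = SimpleNamespace(gamma=ones(1, 3), beta=zeros(1, 3), bn_decay=0.9,
                      _pop_mean=zeros(1, 3), _pop_var=zeros(1, 3), _bn_cache=None)
_xbn = np.random.randn(64, 3) * 5.0 + 2.0
_out = batchnorm_forward(_bn, _xbn, is_training=True)
print("bn mean:", np.round(_out.mean(axis=0), 4), "bn std:", np.round(_out.std(axis=0), 4))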
# grad check
def __compute_approx_grads(nn, x, y, eps=1e-4):
    approx_grads = []
    feed_forward = lambda inp: nn._NeuralNetwork__feedforward(inp, is_training=True)
    for layer in nn.layers:
        assert layer.dropout_prob == 0.0, "Gradient checking cannot be applied to networks with DROPOUT"
        w_ori = layer.weights.copy()
        w_ravel = w_ori.ravel()
        w_shape = w_ori.shape
        for i in range(w_ravel.size):
            w_plus = w_ravel.copy()
            w_plus[i] += eps
            layer.weights = w_plus.reshape(w_shape)
            J_plus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            w_minus = w_ravel.copy()
            w_minus[i] -= eps
            layer.weights = w_minus.reshape(w_shape)
            J_minus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            approx_grads.append((J_plus - J_minus) / (2.0*eps))
        layer.weights = w_ori
    return approx_grads

def gradient_checking(nn, x, y, eps=1e-4, verbose=False, verbose_precision=5):
    from copy import deepcopy
    nn_copy = deepcopy(nn)
    nn.fit(x, y, epochs=0)
    grads = np.concatenate([layer._dweights.ravel() for layer in nn.layers])
    approx_grads = __compute_approx_grads(nn_copy, x, y, eps)
    is_close = np.allclose(grads, approx_grads)
    print("{}".format("\033[92mGRADIENTS OK" if is_close else "\033[91mGRADIENTS FAIL"))
    norm_num = np.linalg.norm(grads - approx_grads)
    norm_den = np.linalg.norm(grads) + np.linalg.norm(approx_grads)
    error = norm_num / norm_den
    print("Relative error:", error)
    if verbose:
        np.set_printoptions(precision=verbose_precision, linewidth=200, suppress=True)
        print("Gradients  :", grads)
        print("Approximate:", np.array(approx_grads))
# implementation
class Layer():
    def __init__(self, input_dim, output_dim, activation=linear, weights_initializer=random_normal,
                 biases_initializer=ones, dropout_prob=0.0, reg_func=l2_regularization, reg_strength=0.0,
                 batch_norm=False, bn_decay=0.9, is_trainable=True):
        self.input = None
        self.weights = weights_initializer(output_dim, input_dim)
        self.biases = biases_initializer(1, output_dim)
        self.activation = activation
        self.dropout_prob = dropout_prob
        self.reg_func = reg_func
        self.reg_strength = reg_strength
        self.batch_norm = batch_norm
        self.bn_decay = bn_decay
        self.gamma, self.beta = ones(1, output_dim), zeros(1, output_dim)
        self.is_trainable = is_trainable
        self._activ_inp, self._activ_out = None, None
        self._dweights, self._dbiases, self._prev_dweights = None, None, 0.0
        self._dropout_mask = None
        self._dgamma, self._dbeta = None, None
        self._pop_mean, self._pop_var = zeros(1, output_dim), zeros(1, output_dim)
        self._bn_cache = None
class NeuralNetwork():
    def __init__(self, cost_func=mse, learning_rate=1e-3, lr_decay_method=none_decay,
                 lr_decay_rate=0.0, lr_decay_steps=1, momentum=0.0, patience=np.inf):
        self.layers = []
        self.cost_func = cost_func
        self.learning_rate = self.lr_initial = learning_rate
        self.lr_decay_method = lr_decay_method
        self.lr_decay_rate = lr_decay_rate
        self.lr_decay_steps = lr_decay_steps
        self.momentum = momentum
        self.patience, self.waiting = patience, 0
        self._best_model, self._best_loss = self.layers, np.inf

    def fit(self, x_train, y_train, x_val=None, y_val=None, epochs=100, verbose=10,
            batch_gen=batch_sequential, batch_size=None):
        x_val, y_val = (x_train, y_train) if (x_val is None or y_val is None) else (x_val, y_val)
        for epoch in range(epochs+1):
            self.learning_rate = self.lr_decay_method(self.lr_initial, epoch, self.lr_decay_rate, self.lr_decay_steps)
            for x_batch, y_batch in batch_gen(x_train, y_train, batch_size):
                y_pred = self.__feedforward(x_batch)
                self.__backprop(y_batch, y_pred)

            loss_val = self.cost_func(y_val, self.predict(x_val))
            if loss_val < self._best_loss:
                self._best_model, self._best_loss = self.layers, loss_val
                self.waiting = 0
            else:
                self.waiting += 1
                if self.waiting >= self.patience:
                    self.layers = self._best_model
                    return

            if epoch % verbose == 0:
                loss_train = self.cost_func(y_train, self.predict(x_train))
                loss_reg = (1.0/y_train.shape[0])*np.sum([layer.reg_strength * layer.reg_func(layer.weights) for layer in self.layers])
                print("epoch: {0:=4}/{1} loss_train: {2:.8f} + {3:.8f} = {4:.8f} loss_val = {5:.8f}".format(epoch, epochs, loss_train, loss_reg, loss_train + loss_reg, loss_val))

    def predict(self, x):
        return self.__feedforward(x, is_training=False)

    def save(self, file_path):
        pkl.dump(self, open(file_path, 'wb'), -1)

    @staticmethod
    def load(file_path):
        return pkl.load(open(file_path, 'rb'))
    def __feedforward(self, x, is_training=True):
        self.layers[0].input = x
        for current_layer, next_layer in zip(self.layers, self.layers[1:] + [Layer(0, 0)]):
            y = np.dot(current_layer.input, current_layer.weights.T) + current_layer.biases
            y = batchnorm_forward(current_layer, y, is_training) if current_layer.batch_norm else y
            current_layer._dropout_mask = np.random.binomial(1, 1.0-current_layer.dropout_prob, y.shape) / (1.0-current_layer.dropout_prob)
            current_layer._activ_inp = y
            current_layer._activ_out = current_layer.activation(y) * (current_layer._dropout_mask if is_training else 1.0)
            next_layer.input = current_layer._activ_out
        return self.layers[-1]._activ_out

    def __backprop(self, y, y_pred):
        last_delta = self.cost_func(y, y_pred, derivative=True)
        for layer in reversed(self.layers):
            dactivation = layer.activation(layer._activ_inp, derivative=True) * last_delta * layer._dropout_mask
            dactivation = batchnorm_backward(layer, dactivation) if layer.batch_norm else dactivation
            last_delta = np.dot(dactivation, layer.weights)
            layer._dweights = np.dot(dactivation.T, layer.input)
            layer._dbiases = 1.0*dactivation.sum(axis=0, keepdims=True)

        for layer in reversed(self.layers):
            if layer.is_trainable:
                layer._dweights = layer._dweights + (1.0/y.shape[0]) * layer.reg_strength * layer.reg_func(layer.weights, derivative=True)
                layer._prev_dweights = -self.learning_rate*layer._dweights + self.momentum*layer._prev_dweights
                layer.weights = layer.weights + layer._prev_dweights
                layer.biases = layer.biases - self.learning_rate*layer._dbiases
                if layer.batch_norm:
                    layer.gamma = layer.gamma - self.learning_rate*layer._dgamma
                    layer.beta = layer.beta - self.learning_rate*layer._dbeta
# example 1
print("------------ example 1 ------------")
x = np.array([[0.05, 0.10]])
y = np.array([[0.01, 0.99]])

D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=mse, learning_rate=0.5)
nn.layers.append(Layer(input_dim=D_in, output_dim=2, activation=sigmoid))
nn.layers.append(Layer(input_dim=2, output_dim=D_out, activation=sigmoid))

w1 = np.array([[0.15, 0.20], [0.25, 0.30]])
b1 = np.array([[0.35]])  # note: strictly this should be 2 biases - one per neuron (1, 2)
w2 = np.array([[0.40, 0.45], [0.50, 0.55]])
b2 = np.array([[0.60]])  # note: strictly this should be 2 biases - one per neuron (1, 2)

nn.layers[0].weights = w1
nn.layers[0].biases = b1
nn.layers[1].weights = w2
nn.layers[1].biases = b2

nn.fit(x, y, epochs=0, verbose=1)
for layer in nn.layers:
    print(layer.weights)
# example 2
print()
print("------------ example 2 ------------")
x = np.array([[0.1, 0.2, 0.7]])
y = np.array([[1, 0, 0]])

D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=0.01)
nn.layers.append(Layer(input_dim=D_in, output_dim=3, activation=relu))
nn.layers.append(Layer(input_dim=3, output_dim=3, activation=sigmoid))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear))

w1 = np.array([[0.1, 0.2, 0.3], [0.3, 0.2, 0.7], [0.4, 0.3, 0.9]])
b1 = np.ones((1, 3))
w2 = np.array([[0.2, 0.3, 0.5], [0.3, 0.5, 0.7], [0.6, 0.4, 0.8]])
b2 = np.ones((1, 3))
w3 = np.array([[0.1, 0.4, 0.8], [0.3, 0.7, 0.2], [0.5, 0.2, 0.9]])
b3 = np.ones((1, 3))

for i, w, b in zip(range(3), [w1, w2, w3], [b1, b2, b3]):
    nn.layers[i].weights = w
    nn.layers[i].biases = b

nn.fit(x, y, epochs=300, verbose=30)
for layer in nn.layers:
    print(layer.weights)

nn.save('model.pkl')

# restart notebook and create new cell
nn = NeuralNetwork.load('model.pkl')
for layer in nn.layers:
    print(layer.weights)
# gradient checking
print()
print("------------ grad. check ------------")
np.random.seed(1234)

N, D = 100, 2
x = np.random.rand(N, D)
y = np.random.rand(N, 1)

# regression
D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=mse, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=True)

# binary classification
y = np.random.randint(0, 2, (N, 1))

D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=sigmoid_cross_entropy, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=False)
# multiclass classification
from sklearn.preprocessing import OneHotEncoder

y = np.random.randint(0, 2, (N, 1))
y_oh = OneHotEncoder(sparse=False).fit_transform(y)  # note: newer scikit-learn versions use sparse_output=False

D_in, D_out = x.shape[1], y_oh.shape[1]
nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y_oh, epochs=100)  # targets must be one-hot encoded for softmax_neg_log_likelihood
gradient_checking(nn, x, y_oh, eps=1e-4, verbose=False)