import numpy as np
from tqdm import trange
from sklearn.base import BaseEstimator


class NeuralNet(BaseEstimator):
    """
    Neural network for classification.

    Parameters
    ----------
    learning_rate : float
        Learning rate for gradient descent.
    hidden_dims : list of int
        Number of units in each hidden layer, e.g. [30] gives one hidden layer
        with 30 units; [50, 50] gives two hidden layers with 50 units each.
    n_iters : int
        Number of iterations to run the algorithm, a.k.a. epochs.
    activation : str, 'relu', 'leaky_relu' or 'tanh'
        Activation function applied after each fully connected layer.
    reg : float
        L2 regularization strength for the weights.
    initialize : str, 'xavier' or 'normal'
        Weight initialization method.
    seed : int
        Seed for the randomly initialized weights.
    """

    def __init__(self, learning_rate, hidden_dims, n_iters,
                 activation, reg, initialize, seed):
        self.reg = reg
        self.seed = seed
        self.n_iters = n_iters
        self.initialize = initialize
        self.activation = activation
        self.hidden_dims = hidden_dims
        self.learning_rate = learning_rate
    def fit(self, X, y):
        """
        Parameters
        ----------
        X : 2d numpy array, shape = [n_samples, n_features]
            The training input samples.
        y : 1d numpy array, shape = [n_samples]
            The target values, a.k.a. class labels in classification.
        """
        N, n_features = X.shape
        n_classes = np.unique(y).shape[0]

        # initialize random weights, these are the parameters we need to learn
        self.biases = []
        self.weights = []
        dims = [n_features] + self.hidden_dims + [n_classes]
        rstate = np.random.RandomState(self.seed)
        for d in range(len(dims) - 1):
            if self.initialize == 'xavier':
                # fan-in based scaling: standard deviation sqrt(2 / fan_in)
                weight = rstate.normal(0, np.sqrt(2.0 / dims[d]),
                                       size = (dims[d], dims[d + 1]))
            elif self.initialize == 'normal':
                weight = rstate.randn(dims[d], dims[d + 1])

            bias = np.zeros((1, dims[d + 1]))
            self.weights.append(weight)
            self.biases.append(bias)
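        # Note on initialization: the 'xavier' branch above uses the fan-in
        # based scale std = sqrt(2 / fan_in), which is the scaling usually
        # recommended for relu-family activations (often called He
        # initialization); Glorot/Xavier initialization proper would use
        # std = sqrt(2 / (fan_in + fan_out)). A minimal sketch of that
        # alternative, assuming the same `dims` list:
        #
        #     weight = rstate.normal(0, np.sqrt(2.0 / (dims[d] + dims[d + 1])),
        #                            size = (dims[d], dims[d + 1]))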
        # alternate between the forward and backpropagation steps to train
        # the neural network, storing the loss and accuracy history; note
        # that each iteration is a full-batch gradient descent step
        self.losses = []
        self.accuracies = []
        for _ in trange(self.n_iters):
            proba, caches = self._forward_pass(X)
            self._backward_pass(proba, caches, y)
            loss = softmax_loss(proba, y, self.weights, self.reg)
            self.losses.append(loss)
            y_pred = np.argmax(proba, axis = 1)
            accuracy = np.sum(y_pred == y) / N
            self.accuracies.append(accuracy * 100)

        return self
    def _forward_pass(self, X):
        """
        Feed forward: given the input data, output the softmax probability
        and a list of caches that contains the information needed for
        backpropagation.
        """
        f, f_cache = feed_forward(X, self.weights[0], self.biases[0])
        caches = [f_cache]
        for weight, bias in zip(self.weights[1:], self.biases[1:]):
            activation_forward = ACTIVATION[self.activation]['forward']
            a, a_cache = activation_forward(f)
            f, f_cache = feed_forward(a, weight, bias)
            caches.append(a_cache)
            caches.append(f_cache)

        proba = softmax_forward(f)
        return proba, caches
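    # For reference, the caches list built above interleaves linear and
    # activation caches. With two hidden layers (e.g. hidden_dims = [50, 50])
    # the layout is:
    #
    #     caches = [linear1, activation1, linear2, activation2, linear3]
    #
    # _backward_pass pops these in reverse order, which is why it consumes
    # one linear cache first and then alternates activation/linear pairs.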
    def _backward_pass(self, proba, caches, y):
        """Backpropagation that computes the gradients and updates the weights."""
        dout = softmax_backward(proba, y)
        cache = caches.pop()
        dx, dw, db = feed_backward(dout, cache)
        dbiases = [db]
        dweights = [dw]
        for _ in range(len(caches) // 2):
            cache = caches.pop()
            activation_backward = ACTIVATION[self.activation]['backward']
            da = activation_backward(dx, cache)
            cache = caches.pop()
            dx, dw, db = feed_backward(da, cache)
            dbiases.append(db)
            dweights.append(dw)

        # add the gradient of the L2 regularization term 0.5 * reg * ||w||^2;
        # dweights is ordered from the last layer to the first, hence the
        # reversed weights
        dweights = [dw + self.reg * w
                    for dw, w in zip(dweights, reversed(self.weights))]

        # update the weights using standard gradient descent, note that the
        # first element of dweights corresponds to the last element of weights
        w_len = len(self.weights) - 1
        for j in range(len(self.weights)):
            self.weights[w_len - j] -= self.learning_rate * dweights[j]
            self.biases[w_len - j] -= self.learning_rate * dbiases[j]

        return self
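    # To make the update order above concrete: with hidden_dims = [50, 50]
    # there are three weight matrices, and the gradients come out of the
    # backward loop in reverse order, so the pairing is
    #
    #     dweights[0] -> self.weights[2]   (output layer)
    #     dweights[1] -> self.weights[1]
    #     dweights[2] -> self.weights[0]   (first layer)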
    def predict(self, X):
        proba = self.predict_proba(X)
        y_pred = np.argmax(proba, axis = 1)
        return y_pred

    def predict_proba(self, X):
        proba, _ = self._forward_pass(X)
        return proba

def softmax_loss(proba, y, weights, reg):
    """Cross entropy loss, averaged over the number of samples."""
    N = y.shape[0]
    # add an epsilon value to prevent taking the log of 0
    log_proba = -np.log(proba[range(N), y] + 1e-9)
    data_loss = np.sum(log_proba) / N

    # L2 regularization on the weights
    weights_sum = np.sum([np.sum(w ** 2) for w in weights])
    reg_loss = 0.5 * reg * weights_sum
    loss = data_loss + reg_loss
    return loss
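# Written out, the loss computed above is
#
#     L = -(1 / N) * sum_i log(p[i, y_i]) + 0.5 * reg * sum_l ||W_l||^2
#
# where p[i, y_i] is the predicted probability of sample i's true class.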
def softmax_forward(x):
    """
    Compute the softmax of matrix x in a numerically stable way,
    by subtracting the max of each row from that row.
    """
    shift_x = x - np.amax(x, axis = 1, keepdims = True)
    exp_x = np.exp(shift_x)
    proba = exp_x / np.sum(exp_x, axis = 1, keepdims = True)
    return proba
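# A quick illustration of why the shift matters (hypothetical values):
# np.exp(1000) overflows to inf, but softmax is shift-invariant, so
# softmax_forward(np.array([[1000.0, 1001.0]])) is computed from the shifted
# logits [-1, 0] and returns roughly [[0.269, 0.731]].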
def softmax_backward(proba, y):
    N = y.shape[0]
    dx = proba.copy()
    dx[range(N), y] -= 1
    dx /= N
    return dx
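# The formula above follows from differentiating the averaged cross entropy
# with respect to the logits: for each sample the gradient is the predicted
# probability vector minus the one-hot encoding of the true label, i.e.
#
#     dL/dlogits = (proba - one_hot(y)) / N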
def feed_forward(x, w, b):
    f = x.dot(w) + b
    f_cache = x, w
    return f, f_cache


def feed_backward(dout, cache):
    x, w = cache
    # the gradients can be computed by matrix multiplication with dout;
    # just be careful with the dimensions of the outputs, e.g. the gradient
    # on the weights dw must have the same shape as the w matrix
    dx = dout.dot(w.T)
    dw = x.T.dot(dout)
    db = np.sum(dout, axis = 0)
    return dx, dw, db
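# Shape bookkeeping for feed_backward: if x is (N, D), w is (D, H) and b is
# (1, H), then the upstream gradient dout is (N, H); dx (N, D) and dw (D, H)
# match the shapes of x and w, while db has shape (H,) and broadcasts against
# the (1, H) bias during the update.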
def relu_forward(x):
    a = np.maximum(0, x)
    a_cache = x
    return a, a_cache


def relu_backward(dout, cache):
    dx = np.where(cache > 0, dout, 0)
    return dx
def leaky_relu_forward(x):
    a = np.maximum(0.01 * x, x)
    a_cache = x
    return a, a_cache


def leaky_relu_backward(dout, cache):
    # pass the gradient through where the input was positive,
    # scale it by 0.01 elsewhere
    dx = np.where(cache > 0, dout, 0.01 * dout)
    return dx
def tanh_forward(x):
    a = np.tanh(x)
    a_cache = x
    return a, a_cache


def tanh_backward(dout, cache):
    # the derivative of tanh(x) is 1 - tanh(x)^2, applied to the cached input
    dx = dout * (1 - np.tanh(cache) ** 2)
    return dx
# lookup table mapping each supported activation name
# to its forward and backward functions
ACTIVATION = {}
ACTIVATION['relu'] = {
    'forward': relu_forward,
    'backward': relu_backward
}
ACTIVATION['tanh'] = {
    'forward': tanh_forward,
    'backward': tanh_backward
}
ACTIVATION['leaky_relu'] = {
    'forward': leaky_relu_forward,
    'backward': leaky_relu_backward
}
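# Adding another activation only requires registering a forward/backward pair
# in the table above. A minimal sketch for a hypothetical sigmoid entry
# (sigmoid_forward / sigmoid_backward are not defined in this file):
#
#     ACTIVATION['sigmoid'] = {
#         'forward': sigmoid_forward,
#         'backward': sigmoid_backward
#     }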
__all__ = ['NeuralNet']

if __name__ == '__main__':
    import matplotlib.pyplot as plt
    from keras.datasets.mnist import load_data
    from sklearn.metrics import accuracy_score
    def plot_info(estimator, X, y, figname = 'history.png'):
        """
        Select a random sample from the dataset, visualize the image along
        with its predicted label and the confidence of that prediction
        (i.e. the predicted probability); also visualize the stored loss and
        accuracy history up to the current iteration. The utility function
        saves the visualization to disk; pass figname = None to turn this
        behavior off.
        """
        fig, ax = plt.subplots(1, 3, figsize = (12, 3))

        # evaluate the overall accuracy
        y_pred = estimator.predict(X)
        accuracy = accuracy_score(y, y_pred)
        title = 'Overall accuracy %0.2f' % accuracy

        # reshape the randomly chosen image to a square
        i = np.random.choice(X.shape[0])
        size = int(np.sqrt(X.shape[1]))
        img = X[i].reshape(size, size)
        ax[0].imshow(img, cmap = 'gray')

        # prediction for the randomly chosen image
        proba = estimator.predict_proba(X[i:i + 1])
        y_pred = np.argmax(proba)
        title += "\nPrediction: %d confidence=%0.2f" % (y_pred, proba[0][y_pred])
        ax[0].set_title(title)
        ax[0].set_xticks([])
        ax[0].set_yticks([])

        ax[1].plot(estimator.losses, color = 'blue')
        ax[1].set_title('Loss')
        ax[1].set_yscale('log')

        # aim for 90% accuracy
        ax[2].plot(estimator.accuracies, color = 'blue')
        ax[2].axhline(90, color = 'red', linestyle = ':')
        ax[2].set_title('Accuracy: %0.2f%%' % estimator.accuracies[-1])

        # add a little height to the figure so no text gets chopped off
        size = fig.get_size_inches()
        fig.set_size_inches(size[0], size[1] + 1)
        if figname is not None:
            fig.savefig(figname)

        plt.show()
    # load the mnist dataset and normalize the pixel values to [0, 1]
    (X_train, y_train), (X_test, y_test) = load_data()
    X_train = X_train.reshape((X_train.shape[0], -1)) / 255.0
    X_test = X_test.reshape((X_test.shape[0], -1)) / 255.0

    # train the neural network model
    nn_params = {
        'reg': 0.01,
        'seed': 1234,
        'n_iters': 350,
        'hidden_dims': [512, 512],
        'learning_rate': 0.01,
        'activation': 'leaky_relu',
        'initialize': 'xavier'
    }
    nn = NeuralNet(**nn_params)
    nn.fit(X_train, y_train)
    plot_info(nn, X_test, y_test)