roycoding · November 16, 2016 21:51
diff --git a/Intro to Neural Networks.ipynb b/Intro to Neural Networks.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Introduction to Neural Networks\n",
    "### Roy Keyes\n",
    "#### @roycoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural network Python code from Michael Nielsen's [\"Neural Networks and Deep Learning\"](http://neuralnetworksanddeeplearning.com), chapter 1.\n",
    "\n",
    "Github: [https://github.com/mnielsen/neural-networks-and-deep-learning](https://github.com/mnielsen/neural-networks-and-deep-learning)\n",
    "\n",
    "The code below has been modified for readability and compatibility with Python 3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import pickle\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The neural network class with training and optimization functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class Network(object):\n",
    "    \"\"\"Fully connected feedforward network of sigmoid neurons, trained\n",
    "    with mini-batch stochastic gradient descent and backpropagation.\"\"\"\n",
    "\n",
    "    def __init__(self, sizes):\n",
    "        \"\"\"Build a network whose layer widths are listed in ``sizes``.\n",
    "\n",
    "        For example, ``[2, 3, 1]`` gives a three-layer network with 2\n",
    "        input neurons, 3 hidden neurons and 1 output neuron.  Biases\n",
    "        and weights are drawn from a Gaussian distribution with mean 0\n",
    "        and variance 1.  The first layer is the input layer and gets no\n",
    "        biases, since biases are only used to compute the outputs of\n",
    "        later layers.\"\"\"\n",
    "\n",
    "        self.num_layers = len(sizes)\n",
    "        self.sizes = sizes\n",
    "        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]\n",
    "        self.weights = [np.random.randn(y, x)\n",
    "                        for x, y in zip(sizes[:-1], sizes[1:])]\n",
    "\n",
    "    def feedforward(self, a):\n",
    "        \"\"\"Return the output of the network if ``a`` is input.\"\"\"\n",
    "        for b, w in zip(self.biases, self.weights):\n",
    "            a = sigmoid(np.dot(w, a) + b)\n",
    "        return a\n",
    "\n",
    "    def SGD(self, training_data, epochs, mini_batch_size, eta,\n",
    "            test_data=None):\n",
    "        \"\"\"Train the network with mini-batch stochastic gradient descent.\n",
    "\n",
    "        ``training_data`` is a list of ``(x, y)`` tuples of training\n",
    "        inputs and desired outputs; it is shuffled in place at the\n",
    "        start of every epoch.  ``epochs`` is the number of passes over\n",
    "        the data, ``mini_batch_size`` the number of examples per\n",
    "        update and ``eta`` the learning rate.  If ``test_data`` is\n",
    "        provided, the network is evaluated against it after each epoch\n",
    "        and the score is printed; this is useful for tracking progress\n",
    "        but slows things down substantially.\"\"\"\n",
    "\n",
    "        n_test = len(test_data) if test_data else 0\n",
    "        n = len(training_data)\n",
    "        for j in range(epochs):   # Modified for Python 3\n",
    "            random.shuffle(training_data)\n",
    "            mini_batches = [\n",
    "                training_data[k:k+mini_batch_size]\n",
    "                for k in range(0, n, mini_batch_size)]    # Modified for Python 3\n",
    "            for mini_batch in mini_batches:\n",
    "                self.update_mini_batch(mini_batch, eta)\n",
    "            if test_data:\n",
    "                print(\"Epoch {0}: {1} / {2}\".format(j, self.evaluate(test_data), n_test))\n",
    "            else:\n",
    "                print(\"Epoch {0} complete\".format(j))\n",
    "\n",
    "    def update_mini_batch(self, mini_batch, eta):\n",
    "        \"\"\"Apply one gradient-descent step computed by backpropagation\n",
    "        over a single mini batch.\n",
    "\n",
    "        ``mini_batch`` is a list of ``(x, y)`` tuples and ``eta`` is\n",
    "        the learning rate.  An empty mini batch leaves the network\n",
    "        unchanged.\"\"\"\n",
    "\n",
    "        if len(mini_batch) == 0:\n",
    "            # Nothing to average over; avoids dividing by zero below.\n",
    "            return\n",
    "        nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
    "        nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
    "        for x, y in mini_batch:\n",
    "            delta_nabla_b, delta_nabla_w = self.backprop(x, y)\n",
    "            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]\n",
    "            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]\n",
    "        # Average the summed gradients and scale by the learning rate once.\n",
    "        step = eta/len(mini_batch)\n",
    "        self.weights = [w-step*nw for w, nw in zip(self.weights, nabla_w)]\n",
    "        self.biases = [b-step*nb for b, nb in zip(self.biases, nabla_b)]\n",
    "\n",
    "    def backprop(self, x, y):\n",
    "        \"\"\"Return a tuple ``(nabla_b, nabla_w)`` representing the\n",
    "        gradient for the cost function C_x.  ``nabla_b`` and\n",
    "        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar\n",
    "        to ``self.biases`` and ``self.weights``.\"\"\"\n",
    "\n",
    "        nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
    "        nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
    "        # Forward pass: record every weighted input z and activation.\n",
    "        activation = x\n",
    "        activations = [x] # list to store all the activations, layer by layer\n",
    "        zs = [] # list to store all the z vectors, layer by layer\n",
    "        for b, w in zip(self.biases, self.weights):\n",
    "            z = np.dot(w, activation) + b\n",
    "            zs.append(z)\n",
    "            activation = sigmoid(z)\n",
    "            activations.append(activation)\n",
    "        # Backward pass: start with the error at the output layer.\n",
    "        delta = (self.cost_derivative(activations[-1], y)\n",
    "                 * sigmoid_prime(zs[-1]))\n",
    "        nabla_b[-1] = delta\n",
    "        nabla_w[-1] = np.dot(delta, activations[-2].transpose())\n",
    "\n",
    "        # Here l counts layers from the end: l = 1 is the last layer,\n",
    "        # l = 2 the second-last, and so on, which lets us use Python's\n",
    "        # negative list indices.  This differs from the numbering used\n",
    "        # in Chapter 2 of the book.\n",
    "        for l in range(2, self.num_layers):   # Modified for Python 3\n",
    "            sp = sigmoid_prime(zs[-l])\n",
    "            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp\n",
    "            nabla_b[-l] = delta\n",
    "            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())\n",
    "        return (nabla_b, nabla_w)\n",
    "\n",
    "    def evaluate(self, test_data):\n",
    "        \"\"\"Return the number of test inputs for which the neural\n",
    "        network outputs the correct result. Note that the neural\n",
    "        network's output is assumed to be the index of whichever\n",
    "        neuron in the final layer has the highest activation.\"\"\"\n",
    "\n",
    "        test_results = [(np.argmax(self.feedforward(x)), y)\n",
    "                        for (x, y) in test_data]\n",
    "        return sum(int(x == y) for (x, y) in test_results)\n",
    "\n",
    "    def cost_derivative(self, output_activations, y):\n",
    "        r\"\"\"Return the vector of partial derivatives \\partial C_x /\n",
    "        \\partial a for the output activations.\"\"\"\n",
    "\n",
    "        return (output_activations-y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Miscellaneous functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def sigmoid(z):\n",
    "    \"\"\"The sigmoid function, 1 / (1 + exp(-z)), applied elementwise.\"\"\"\n",
    "    return 1.0/(1.0+np.exp(-z))\n",
    "\n",
    "def sigmoid_prime(z):\n",
    "    \"\"\"Derivative of the sigmoid function, evaluated at ``z``.\"\"\"\n",
    "    # Evaluate the sigmoid once and reuse it instead of computing it twice.\n",
    "    s = sigmoid(z)\n",
    "    return s*(1-s)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Code to load the training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_data(filename=None):\n",
    "    \"\"\"Return the MNIST data as a tuple ``(training_data,\n",
    "    validation_data, test_data)``.\n",
    "\n",
    "    ``training_data`` is a tuple with two entries.  The first is a\n",
    "    numpy ndarray with 50,000 entries, each in turn a numpy ndarray\n",
    "    with 784 values, representing the 28 * 28 = 784 pixels in a\n",
    "    single MNIST image.  The second is a numpy ndarray containing the\n",
    "    50,000 corresponding digit values (0...9).\n",
    "\n",
    "    ``validation_data`` and ``test_data`` are similar, except each\n",
    "    contains only 10,000 images.\n",
    "\n",
    "    ``filename`` is the path to the gzipped pickle file; when omitted,\n",
    "    the copy inside a local checkout of the book's repository is used.\n",
    "\n",
    "    This is a nice data format, but for use in neural networks it's\n",
    "    helpful to modify the format of the ``training_data`` a little.\n",
    "    That's done in the wrapper function ``load_data_wrapper()``.\n",
    "    \"\"\"\n",
    "\n",
    "    if not filename:\n",
    "        filename = './neural-networks-and-deep-learning/data/mnist.pkl.gz'\n",
    "    # The context manager closes the file even if unpickling fails.\n",
    "    # NOTE: unpickling can execute arbitrary code; only load trusted files.\n",
    "    with gzip.open(filename, 'rb') as f:\n",
    "        # Encoding needed to read the Python 2 pickle in Python 3\n",
    "        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')\n",
    "    return (training_data, validation_data, test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_data_wrapper(filename=None):\n",
    "    \"\"\"Return ``(training_data, validation_data, test_data)`` in the\n",
    "    format used by our neural network code, built on ``load_data``.\n",
    "\n",
    "    ``training_data`` is a list containing 50,000 2-tuples ``(x, y)``\n",
    "    where ``x`` is a 784-dimensional numpy.ndarray containing the\n",
    "    input image and ``y`` is a 10-dimensional numpy.ndarray, the unit\n",
    "    vector corresponding to the correct digit for ``x``.\n",
    "\n",
    "    ``validation_data`` and ``test_data`` are lists containing 10,000\n",
    "    2-tuples ``(x, y)`` where ``x`` is a 784-dimensional numpy.ndarray\n",
    "    containing the input image and ``y`` is the corresponding digit\n",
    "    value (an integer).\n",
    "\n",
    "    The training data and the validation / test data therefore use\n",
    "    slightly different formats; these turn out to be the most\n",
    "    convenient for use in our neural network code.\"\"\"\n",
    "\n",
    "    def as_columns(images):\n",
    "        # Turn every flat image into a (784, 1) column vector.\n",
    "        return [np.reshape(image, (784, 1)) for image in images]\n",
    "\n",
    "    raw_training, raw_validation, raw_test = load_data(filename=filename)\n",
    "    training_inputs = as_columns(raw_training[0])\n",
    "    training_targets = [vectorized_result(digit) for digit in raw_training[1]]\n",
    "    # zip() is lazy in Python 3, so materialise each pairing as a list.\n",
    "    training_data = list(zip(training_inputs, training_targets))\n",
    "    validation_data = list(zip(as_columns(raw_validation[0]), raw_validation[1]))\n",
    "    test_data = list(zip(as_columns(raw_test[0]), raw_test[1]))\n",
    "    return (training_data, validation_data, test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def vectorized_result(j, num_classes=10):\n",
    "    \"\"\"Return a ``num_classes``-dimensional unit column vector with a\n",
    "    1.0 in the jth position and zeroes elsewhere.\n",
    "\n",
    "    With the default of 10 classes this converts a digit (0...9) into\n",
    "    the corresponding desired output from the neural network.  The\n",
    "    result has shape ``(num_classes, 1)``.\"\"\"\n",
    "    e = np.zeros((num_classes, 1))\n",
    "    e[j] = 1.0\n",
    "    return e"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train a network"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### First load the data\n",
    "Load the MNIST data, which comes split into training, validation, and test sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "training_data, validation_data, test_data = load_data_wrapper()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Initialize a network\n",
    "Hidden layer nodes = 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "net = Network([784, 30, 10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train and evaluate!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0: 9092 / 10000\n",
      "Epoch 1: 9220 / 10000\n",
      "Epoch 2: 9314 / 10000\n",
      "Epoch 3: 9363 / 10000\n",
      "Epoch 4: 9407 / 10000\n",
      "Epoch 5: 9393 / 10000\n",
      "Epoch 6: 9447 / 10000\n",
      "Epoch 7: 9450 / 10000\n",
      "Epoch 8: 9460 / 10000\n",
      "Epoch 9: 9434 / 10000\n",
      "Epoch 10: 9465 / 10000\n",
      "Epoch 11: 9477 / 10000\n",
      "Epoch 12: 9478 / 10000\n",
      "Epoch 13: 9516 / 10000\n",
      "Epoch 14: 9487 / 10000\n",
      "Epoch 15: 9495 / 10000\n",
      "Epoch 16: 9498 / 10000\n",
      "Epoch 17: 9490 / 10000\n",
      "Epoch 18: 9504 / 10000\n",
      "Epoch 19: 9456 / 10000\n",
      "Epoch 20: 9505 / 10000\n",
      "Epoch 21: 9507 / 10000\n",
      "Epoch 22: 9526 / 10000\n",
      "Epoch 23: 9506 / 10000\n",
      "Epoch 24: 9521 / 10000\n",
      "Epoch 25: 9522 / 10000\n",
      "Epoch 26: 9531 / 10000\n",
      "Epoch 27: 9523 / 10000\n",
      "Epoch 28: 9522 / 10000\n",
      "Epoch 29: 9530 / 10000\n"
     ]
    }
   ],
   "source": [
    "net.SGD(training_data, 30, 10, 3.0, test_data=test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### A larger network\n",
    "Hidden layer nodes = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0: 6579 / 10000\n",
      "Epoch 1: 7591 / 10000\n",
      "Epoch 2: 7699 / 10000\n",
      "Epoch 3: 7785 / 10000\n",
      "Epoch 4: 7792 / 10000\n",
      "Epoch 5: 7992 / 10000\n",
      "Epoch 6: 8008 / 10000\n",
      "Epoch 7: 8020 / 10000\n",
      "Epoch 8: 7928 / 10000\n",
      "Epoch 9: 8244 / 10000\n",
      "Epoch 10: 8408 / 10000\n",
      "Epoch 11: 8173 / 10000\n",
      "Epoch 12: 8406 / 10000\n",
      "Epoch 13: 8750 / 10000\n",
      "Epoch 14: 9569 / 10000\n",
      "Epoch 15: 9583 / 10000\n",
      "Epoch 16: 9626 / 10000\n",
      "Epoch 17: 9653 / 10000\n",
      "Epoch 18: 9661 / 10000\n",
      "Epoch 19: 9634 / 10000\n",
      "Epoch 20: 9656 / 10000\n",
      "Epoch 21: 9642 / 10000\n",
      "Epoch 22: 9665 / 10000\n",
      "Epoch 23: 9674 / 10000\n",
      "Epoch 24: 9638 / 10000\n",
      "Epoch 25: 9642 / 10000\n",
      "Epoch 26: 9661 / 10000\n",
      "Epoch 27: 9663 / 10000\n",
      "Epoch 28: 9669 / 10000\n",
      "Epoch 29: 9686 / 10000\n"
     ]
    }
   ],
   "source": [
    "net = Network([784, 100, 10])\n",
    "net.SGD(training_data, 30, 10, 6.0, test_data=test_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"slideshow": {
	"slide_type": "slide"
	}
	},
	"source": [
	"# Introduction to Neural Networks\n",
	"### Roy Keyes\n",
	"#### @roycoding"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Neural network Python code from Michael Nielsen's [\"Neural Networks and Deep Learning\"](http://neuralnetworksanddeeplearning.com), chapter 1.\n",
	"\n",
	"Github: [https://github.com/mnielsen/neural-networks-and-deep-learning](https://github.com/mnielsen/neural-networks-and-deep-learning)\n",
	"\n",
	"The code below has been modified for readability and compatibility with Python 3."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import random\n",
	"import pickle\n",
	"import gzip"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## The neural network class with training and optimization functions"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"class Network(object):\n",
	"    \"\"\"Fully connected feedforward network of sigmoid neurons, trained\n",
	"    with mini-batch stochastic gradient descent and backpropagation.\"\"\"\n",
	"\n",
	"    def __init__(self, sizes):\n",
	"        \"\"\"Build a network whose layer widths are listed in ``sizes``.\n",
	"\n",
	"        For example, ``[2, 3, 1]`` gives a three-layer network with 2\n",
	"        input neurons, 3 hidden neurons and 1 output neuron.  Biases\n",
	"        and weights are drawn from a Gaussian distribution with mean 0\n",
	"        and variance 1.  The first layer is the input layer and gets no\n",
	"        biases, since biases are only used to compute the outputs of\n",
	"        later layers.\"\"\"\n",
	"\n",
	"        self.num_layers = len(sizes)\n",
	"        self.sizes = sizes\n",
	"        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]\n",
	"        self.weights = [np.random.randn(y, x)\n",
	"                        for x, y in zip(sizes[:-1], sizes[1:])]\n",
	"\n",
	"    def feedforward(self, a):\n",
	"        \"\"\"Return the output of the network if ``a`` is input.\"\"\"\n",
	"        for b, w in zip(self.biases, self.weights):\n",
	"            a = sigmoid(np.dot(w, a) + b)\n",
	"        return a\n",
	"\n",
	"    def SGD(self, training_data, epochs, mini_batch_size, eta,\n",
	"            test_data=None):\n",
	"        \"\"\"Train the network with mini-batch stochastic gradient descent.\n",
	"\n",
	"        ``training_data`` is a list of ``(x, y)`` tuples of training\n",
	"        inputs and desired outputs; it is shuffled in place at the\n",
	"        start of every epoch.  ``epochs`` is the number of passes over\n",
	"        the data, ``mini_batch_size`` the number of examples per\n",
	"        update and ``eta`` the learning rate.  If ``test_data`` is\n",
	"        provided, the network is evaluated against it after each epoch\n",
	"        and the score is printed; this is useful for tracking progress\n",
	"        but slows things down substantially.\"\"\"\n",
	"\n",
	"        n_test = len(test_data) if test_data else 0\n",
	"        n = len(training_data)\n",
	"        for j in range(epochs):   # Modified for Python 3\n",
	"            random.shuffle(training_data)\n",
	"            mini_batches = [\n",
	"                training_data[k:k+mini_batch_size]\n",
	"                for k in range(0, n, mini_batch_size)]    # Modified for Python 3\n",
	"            for mini_batch in mini_batches:\n",
	"                self.update_mini_batch(mini_batch, eta)\n",
	"            if test_data:\n",
	"                print(\"Epoch {0}: {1} / {2}\".format(j, self.evaluate(test_data), n_test))\n",
	"            else:\n",
	"                print(\"Epoch {0} complete\".format(j))\n",
	"\n",
	"    def update_mini_batch(self, mini_batch, eta):\n",
	"        \"\"\"Apply one gradient-descent step computed by backpropagation\n",
	"        over a single mini batch.\n",
	"\n",
	"        ``mini_batch`` is a list of ``(x, y)`` tuples and ``eta`` is\n",
	"        the learning rate.  An empty mini batch leaves the network\n",
	"        unchanged.\"\"\"\n",
	"\n",
	"        if len(mini_batch) == 0:\n",
	"            # Nothing to average over; avoids dividing by zero below.\n",
	"            return\n",
	"        nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
	"        nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
	"        for x, y in mini_batch:\n",
	"            delta_nabla_b, delta_nabla_w = self.backprop(x, y)\n",
	"            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]\n",
	"            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]\n",
	"        # Average the summed gradients and scale by the learning rate once.\n",
	"        step = eta/len(mini_batch)\n",
	"        self.weights = [w-step*nw for w, nw in zip(self.weights, nabla_w)]\n",
	"        self.biases = [b-step*nb for b, nb in zip(self.biases, nabla_b)]\n",
	"\n",
	"    def backprop(self, x, y):\n",
	"        \"\"\"Return a tuple ``(nabla_b, nabla_w)`` representing the\n",
	"        gradient for the cost function C_x.  ``nabla_b`` and\n",
	"        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar\n",
	"        to ``self.biases`` and ``self.weights``.\"\"\"\n",
	"\n",
	"        nabla_b = [np.zeros(b.shape) for b in self.biases]\n",
	"        nabla_w = [np.zeros(w.shape) for w in self.weights]\n",
	"        # Forward pass: record every weighted input z and activation.\n",
	"        activation = x\n",
	"        activations = [x] # list to store all the activations, layer by layer\n",
	"        zs = [] # list to store all the z vectors, layer by layer\n",
	"        for b, w in zip(self.biases, self.weights):\n",
	"            z = np.dot(w, activation) + b\n",
	"            zs.append(z)\n",
	"            activation = sigmoid(z)\n",
	"            activations.append(activation)\n",
	"        # Backward pass: start with the error at the output layer.\n",
	"        delta = (self.cost_derivative(activations[-1], y)\n",
	"                 * sigmoid_prime(zs[-1]))\n",
	"        nabla_b[-1] = delta\n",
	"        nabla_w[-1] = np.dot(delta, activations[-2].transpose())\n",
	"\n",
	"        # Here l counts layers from the end: l = 1 is the last layer,\n",
	"        # l = 2 the second-last, and so on, which lets us use Python's\n",
	"        # negative list indices.  This differs from the numbering used\n",
	"        # in Chapter 2 of the book.\n",
	"        for l in range(2, self.num_layers):   # Modified for Python 3\n",
	"            sp = sigmoid_prime(zs[-l])\n",
	"            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp\n",
	"            nabla_b[-l] = delta\n",
	"            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())\n",
	"        return (nabla_b, nabla_w)\n",
	"\n",
	"    def evaluate(self, test_data):\n",
	"        \"\"\"Return the number of test inputs for which the neural\n",
	"        network outputs the correct result. Note that the neural\n",
	"        network's output is assumed to be the index of whichever\n",
	"        neuron in the final layer has the highest activation.\"\"\"\n",
	"\n",
	"        test_results = [(np.argmax(self.feedforward(x)), y)\n",
	"                        for (x, y) in test_data]\n",
	"        return sum(int(x == y) for (x, y) in test_results)\n",
	"\n",
	"    def cost_derivative(self, output_activations, y):\n",
	"        r\"\"\"Return the vector of partial derivatives \\partial C_x /\n",
	"        \\partial a for the output activations.\"\"\"\n",
	"\n",
	"        return (output_activations-y)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Miscellaneous functions"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def sigmoid(z):\n",
	"    \"\"\"The sigmoid function, 1 / (1 + exp(-z)), applied elementwise.\"\"\"\n",
	"    return 1.0/(1.0+np.exp(-z))\n",
	"\n",
	"def sigmoid_prime(z):\n",
	"    \"\"\"Derivative of the sigmoid function, evaluated at ``z``.\"\"\"\n",
	"    # Evaluate the sigmoid once and reuse it instead of computing it twice.\n",
	"    s = sigmoid(z)\n",
	"    return s*(1-s)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Code to load the training data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def load_data(filename=None):\n",
	"    \"\"\"Return the MNIST data as a tuple ``(training_data,\n",
	"    validation_data, test_data)``.\n",
	"\n",
	"    ``training_data`` is a tuple with two entries.  The first is a\n",
	"    numpy ndarray with 50,000 entries, each in turn a numpy ndarray\n",
	"    with 784 values, representing the 28 * 28 = 784 pixels in a\n",
	"    single MNIST image.  The second is a numpy ndarray containing the\n",
	"    50,000 corresponding digit values (0...9).\n",
	"\n",
	"    ``validation_data`` and ``test_data`` are similar, except each\n",
	"    contains only 10,000 images.\n",
	"\n",
	"    ``filename`` is the path to the gzipped pickle file; when omitted,\n",
	"    the copy inside a local checkout of the book's repository is used.\n",
	"\n",
	"    This is a nice data format, but for use in neural networks it's\n",
	"    helpful to modify the format of the ``training_data`` a little.\n",
	"    That's done in the wrapper function ``load_data_wrapper()``.\n",
	"    \"\"\"\n",
	"\n",
	"    if not filename:\n",
	"        filename = './neural-networks-and-deep-learning/data/mnist.pkl.gz'\n",
	"    # The context manager closes the file even if unpickling fails.\n",
	"    # NOTE: unpickling can execute arbitrary code; only load trusted files.\n",
	"    with gzip.open(filename, 'rb') as f:\n",
	"        # Encoding needed to read the Python 2 pickle in Python 3\n",
	"        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')\n",
	"    return (training_data, validation_data, test_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def load_data_wrapper(filename=None):\n",
	"    \"\"\"Return ``(training_data, validation_data, test_data)`` in the\n",
	"    format used by our neural network code, built on ``load_data``.\n",
	"\n",
	"    ``training_data`` is a list containing 50,000 2-tuples ``(x, y)``\n",
	"    where ``x`` is a 784-dimensional numpy.ndarray containing the\n",
	"    input image and ``y`` is a 10-dimensional numpy.ndarray, the unit\n",
	"    vector corresponding to the correct digit for ``x``.\n",
	"\n",
	"    ``validation_data`` and ``test_data`` are lists containing 10,000\n",
	"    2-tuples ``(x, y)`` where ``x`` is a 784-dimensional numpy.ndarray\n",
	"    containing the input image and ``y`` is the corresponding digit\n",
	"    value (an integer).\n",
	"\n",
	"    The training data and the validation / test data therefore use\n",
	"    slightly different formats; these turn out to be the most\n",
	"    convenient for use in our neural network code.\"\"\"\n",
	"\n",
	"    def as_columns(images):\n",
	"        # Turn every flat image into a (784, 1) column vector.\n",
	"        return [np.reshape(image, (784, 1)) for image in images]\n",
	"\n",
	"    raw_training, raw_validation, raw_test = load_data(filename=filename)\n",
	"    training_inputs = as_columns(raw_training[0])\n",
	"    training_targets = [vectorized_result(digit) for digit in raw_training[1]]\n",
	"    # zip() is lazy in Python 3, so materialise each pairing as a list.\n",
	"    training_data = list(zip(training_inputs, training_targets))\n",
	"    validation_data = list(zip(as_columns(raw_validation[0]), raw_validation[1]))\n",
	"    test_data = list(zip(as_columns(raw_test[0]), raw_test[1]))\n",
	"    return (training_data, validation_data, test_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def vectorized_result(j, num_classes=10):\n",
	"    \"\"\"Return a ``num_classes``-dimensional unit column vector with a\n",
	"    1.0 in the jth position and zeroes elsewhere.\n",
	"\n",
	"    With the default of 10 classes this converts a digit (0...9) into\n",
	"    the corresponding desired output from the neural network.  The\n",
	"    result has shape ``(num_classes, 1)``.\"\"\"\n",
	"    e = np.zeros((num_classes, 1))\n",
	"    e[j] = 1.0\n",
	"    return e"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Train a network"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### First load the data\n",
	"Load the MNIST data, which comes split into training, validation, and test sets."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"training_data, validation_data, test_data = load_data_wrapper()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Initialize a network\n",
	"Hidden layer nodes = 30"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"net = Network([784, 30, 10])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Train and evaluate!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 0: 9092 / 10000\n",
	"Epoch 1: 9220 / 10000\n",
	"Epoch 2: 9314 / 10000\n",
	"Epoch 3: 9363 / 10000\n",
	"Epoch 4: 9407 / 10000\n",
	"Epoch 5: 9393 / 10000\n",
	"Epoch 6: 9447 / 10000\n",
	"Epoch 7: 9450 / 10000\n",
	"Epoch 8: 9460 / 10000\n",
	"Epoch 9: 9434 / 10000\n",
	"Epoch 10: 9465 / 10000\n",
	"Epoch 11: 9477 / 10000\n",
	"Epoch 12: 9478 / 10000\n",
	"Epoch 13: 9516 / 10000\n",
	"Epoch 14: 9487 / 10000\n",
	"Epoch 15: 9495 / 10000\n",
	"Epoch 16: 9498 / 10000\n",
	"Epoch 17: 9490 / 10000\n",
	"Epoch 18: 9504 / 10000\n",
	"Epoch 19: 9456 / 10000\n",
	"Epoch 20: 9505 / 10000\n",
	"Epoch 21: 9507 / 10000\n",
	"Epoch 22: 9526 / 10000\n",
	"Epoch 23: 9506 / 10000\n",
	"Epoch 24: 9521 / 10000\n",
	"Epoch 25: 9522 / 10000\n",
	"Epoch 26: 9531 / 10000\n",
	"Epoch 27: 9523 / 10000\n",
	"Epoch 28: 9522 / 10000\n",
	"Epoch 29: 9530 / 10000\n"
	]
	}
	],
	"source": [
	"net.SGD(training_data, 30, 10, 3.0, test_data=test_data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### A larger network\n",
	"Hidden layer nodes = 100"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 0: 6579 / 10000\n",
	"Epoch 1: 7591 / 10000\n",
	"Epoch 2: 7699 / 10000\n",
	"Epoch 3: 7785 / 10000\n",
	"Epoch 4: 7792 / 10000\n",
	"Epoch 5: 7992 / 10000\n",
	"Epoch 6: 8008 / 10000\n",
	"Epoch 7: 8020 / 10000\n",
	"Epoch 8: 7928 / 10000\n",
	"Epoch 9: 8244 / 10000\n",
	"Epoch 10: 8408 / 10000\n",
	"Epoch 11: 8173 / 10000\n",
	"Epoch 12: 8406 / 10000\n",
	"Epoch 13: 8750 / 10000\n",
	"Epoch 14: 9569 / 10000\n",
	"Epoch 15: 9583 / 10000\n",
	"Epoch 16: 9626 / 10000\n",
	"Epoch 17: 9653 / 10000\n",
	"Epoch 18: 9661 / 10000\n",
	"Epoch 19: 9634 / 10000\n",
	"Epoch 20: 9656 / 10000\n",
	"Epoch 21: 9642 / 10000\n",
	"Epoch 22: 9665 / 10000\n",
	"Epoch 23: 9674 / 10000\n",
	"Epoch 24: 9638 / 10000\n",
	"Epoch 25: 9642 / 10000\n",
	"Epoch 26: 9661 / 10000\n",
	"Epoch 27: 9663 / 10000\n",
	"Epoch 28: 9669 / 10000\n",
	"Epoch 29: 9686 / 10000\n"
	]
	}
	],
	"source": [
	"net = Network([784, 100, 10])\n",
	"net.SGD(training_data, 30, 10, 6.0, test_data=test_data)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}