{
"cells": [
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "import numpy as np\nimport pandas as pd\nfrom sklearn.datasets import fetch_mldata\n\nimport matplotlib\nimport matplotlib.pyplot as plt\nimport matplotlib.style as style\nstyle.use('bmh')\n%matplotlib inline\n\npd.options.display.max_rows = 14\n\nfrom IPython.core.interactiveshell import InteractiveShell\nInteractiveShell.ast_node_interactivity = \"all\"",
"execution_count": 21,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Load and explore the dataset"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "mnist = fetch_mldata('MNIST original')\n# What's the structure of the object returned by sklearn?\nmnist",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "{'COL_NAMES': ['label', 'data'],\n 'DESCR': 'mldata.org dataset: mnist-original',\n 'data': array([[0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n ..., \n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0],\n [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),\n 'target': array([ 0., 0., 0., ..., 9., 9., 9.])}"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "* DESCR should contain more information about the dataset but unfortunately it most often doesn't\n* You should find out more information about the dataset on your own\n* features are available in `mnist.data`, labels are available in `mnist.target`"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "mnist.DESCR",
"execution_count": 23,
"outputs": [
{
"data": {
"text/plain": "'mldata.org dataset: mnist-original'"
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "X = mnist['data']\ny = mnist['target']\n\nX.shape, y.shape\nnp.sqrt(784)",
"execution_count": 24,
"outputs": [
{
"data": {
"text/plain": "((70000, 784), (70000,))"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "28.0"
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "* There are 70,000 observations. 784 features/columns. Each observation has the image data in the form of 28x28 pixels per image. \n* To train, use 784 features as it is. \n* To print in image form, convert each observation(e.g. X[432] or X[766]) to 28x28 form using function `reshape`(e.g. X[22].reshape(28, 28))"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "d = X[34911]\nl = y[34911]\n\na = plt.imshow(d.reshape(28, 28), cmap = matplotlib.cm.binary)\na = plt.axis('off')\n\n# what's the label for the above observation?\n# l",
"execution_count": 25,
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAPgAAAD1CAYAAAB9TzjVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAABl9JREFUeJzt3T9IlXscx3ENoYQwcLChqCGcGoQg\npKWhhiCHxsKGkiaNhpayVqGlhqgtCISWaghCiBxq6w8EUUO0VUvhEiFGgUTe+XJ5vt6rnns6n/N6\nrR+ecx7IN8/wu8+5vSsrKz1Apk3tvgGgdQQOwQQOwQQOwQQOwQQOwfpa/PnO4KD1epsGT3AIJnAI\nJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAI\nJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAIJnAI1tfuG2Bjff36\ntdzv379f7lNTU+U+PDxc7ufPn2/c9u3bV147Ojpa7vx3nuAQTOAQTOAQTOAQTOAQTOAQTOAQrHdl\nZaWVn9/SD+9WL168aNxmZmbKa+fn58t9tb+H3t7ecq8MDQ2V+/bt28t9cnKy3MfGxhq3nTt3ltd2\nuMZ/FE9wCCZwCCZwCCZwCCZwCCZwCOaY7A909+7dcp+YmGjclpeX1/XdrTwma7WRkZHG7dmzZ+W1\n/f39G307/yfHZNCNBA7BBA7BBA7BBA7BBA7BBA7B/GxyGywuLpb7rVu3yn29Z92V6enpct+0qTOf\nCX/y+X0rdea/FvCvCByCCRyCCRyCCRyCCRyCCRyCeR+8BZ48eVLup0+fLvcvX76s+bsHBgbK/eDB\ng+X+8OHDNX83beN9cOhGAodgAodgAodgAodgAodgAodg3gdv8Pv378bt9u3b5bWr7es55+7p6enZ\nunVr43b9+vXy2lOnTq3ru+ksnuAQTOAQTOAQTOAQTOAQTOAQTOAQzPvgDarfLh8cHPwf7+Sf9u7d\n27jNzc2V1+7evXujb4f28z44dCOBQzCBQzCBQzCBQzCBQzCvi3agd+/eNW6zs7PltcPDw+U+Pj6+\nllviD+UJDsEEDsEEDsEEDsEEDsEEDsEEDsG8Ltrg169fjdvLly/La6tz6p6enp6bN2+W+/v378u9\nlS5evFjuZ86cKfc9e/Zs5O3w73hdFLqRwCGYwCGYwCGYwCGYwCGYwCGYc/A2WFpaKvfjx4+X+/z8\n/Ebezt+s9vewY8eOcn/w4EHjtn///jXdE6tyDg7dSOAQTOAQTOAQTOAQTOAQTOAQzDl4B3r16lXj\nduzYsfLahYWFcl/t76G3t/HIdVXT09PlfuXKlTV/dpdzDg7dSOAQTOAQTOAQTOAQTOAQTOAQzDl4\nmA8fPpT7jx8/yv3s2bPl/ubNm3L//v174zY0NFRee/jw4XK/ceNGuQ8ODpZ7MOfg0I0EDsEEDsEE\nDsEEDsEEDsEck/GfrPa/Pr5w4ULjtry8vK7vnpubK/ejR4+u6/M7mGMy6EYCh2ACh2ACh2ACh2AC\nh2ACh2B97b4BOsu5c+fKfdu2bY3bxMTERt8Oq/AEh2ACh2ACh2ACh2ACh2ACh2ACh2DeB2+Dz58/\nl/vVq1fL/dq1a41bX197/9OGxcXFxm29P2s8MjJS7q9fv17X53cw74NDNxI4BBM4BBM4BBM4BBM4\nBBM4BPM+eBusdh68sLBQ7jMzM43b5cuXy2s3b95c7n+yb9++tfsWOo4nOAQTOAQTOAQTOAQTOAQT\nOARzTNYG/f395T45OVnuY2NjjdunT5/Ka6empsp9dHS03FezZcuWxu3IkSPltfPz8+v6bv7JExyC\nCRyCCRyCCRyCCRyCCRyCCRyC+dnkDvT06dPG7eTJk+W1S0tL5X7o0KFyP3HiRLnPzs42bs+fPy+v\n/fnzZ7nv2rWr3D9+/FjuwfxsMnQjgUMwgUMwgUMwgUMwgUMwgUMw5+BhHj9+XO6XLl0q97dv35Z7\nb2/jkeu6DQwMlPuBAwfK/dGjRxt5O53EOTh0I4FDMIFDMIFDMIFDMIFDMIFDMOfg/M34+Hi537t3\nr2XffefOnXJf7d66mHNw6EYCh2ACh2ACh2ACh2ACh2ACh2DOwaHzOQeHbiRwCCZwCCZwCCZwCCZw\nCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCCZwCNbX4s9v/DlX\noPU8wSGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGYwCGY\nwCGYwCHYX+9cCijqr4YBAAAAAElFTkSuQmCC\n",
"text/plain": "<matplotlib.figure.Figure at 0x7f0df7d06d68>"
},
"metadata": {},
"output_type": "display_data"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Split the set: first 60k observations into training set, remaining 10k observations into test set\nX_train, y_train = X[:60000], y[:60000]\nX_test, y_test = X[60000:], y[60000:]\n\n# Shuffle the training set. \n# First compute indices in random order so it can be used on both X_train and y_train; may not be needed if both are in the same dataset\nshuffle_index = np.random.permutation(60000)\nX_train, y_train = X_train[shuffle_index], y_train[shuffle_index]",
"execution_count": 26,
"outputs": []
},
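{
"metadata": {},
"cell_type": "markdown",
"source": "* A minimal alternative sketch (equivalent, not required): `sklearn.utils.shuffle` shuffles both arrays in lockstep, avoiding the manual permutation index."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Alternative sketch: shuffle features and labels in lockstep with sklearn's\n# utility instead of managing a permutation index by hand; equivalent result.\nfrom sklearn.utils import shuffle\nX_train, y_train = shuffle(X_train, y_train, random_state=42)",
"execution_count": null,
"outputs": []
},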
{
"metadata": {},
"cell_type": "markdown",
"source": "# Binary Classifier: Is the digit 5 or not?"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Let us create a simple binary classifier that can classify digits 0-9 as two categories: `5` or `not 5`(hence the name)\nfrom sklearn.linear_model import SGDClassifier\n\n# whereever label is 5, set that label to `True` instead\n# wherever the label is 0,1,2,3,4,6,7,8,9, set that label to `False` instead\ny_train_5 = (y_train == 5) \ny_test_5 = (y_test == 5) ",
"execution_count": 27,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "sgd_clf = SGDClassifier(random_state=42, max_iter=5)\nx = sgd_clf.fit(X_train, y_train_5) # train!",
"execution_count": 28,
"outputs": [
{
"data": {
"text/plain": "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,\n n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,\n tol=None, verbose=0, warm_start=False)"
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# predict on a sample observation and check with its label\nsgd_clf.predict([sample_digit])\nl\n# Exercise: try out predict() on a few other training observations",
"execution_count": 34,
"outputs": [
{
"data": {
"text/plain": "array([ True], dtype=bool)"
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "5.0"
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
]
},
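{
"metadata": {},
"cell_type": "markdown",
"source": "* A sketch for the exercise above: the indices are chosen arbitrarily; compare each prediction with the true binary label."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Exercise sketch: predict on a few more training observations\n# (arbitrary indices) and compare with the true binary labels.\nfor i in (0, 15000, 30000, 59999):\n    print(sgd_clf.predict([X_train[i]]), y_train_5[i])",
"execution_count": null,
"outputs": []
},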
{
"metadata": {},
"cell_type": "markdown",
"source": "# Performance Measures"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "### Calculate accuracy of classification using k-fold cross-validation\nfrom sklearn.model_selection import cross_val_score\ncross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')",
"execution_count": 30,
"outputs": [
{
"data": {
"text/plain": "array([ 0.9645 , 0.9656 , 0.94645])"
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "* Is ~94-96% accuracy good? Let us see the accuracy of one of the worst classifiers possible.\n* What's the accuracy of the classifier that classifies every number as not being 5\n\n* The following classifier does no training at all. Whenever predict is called, it just returns all 0s in the shape of X.\n* Do you understand how reshaping is being done here and why?\n* If you forgot how required shape can be passed to `np.zeros`: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.zeros.html"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "from sklearn.base import BaseEstimator\nclass Never5Classifier(BaseEstimator):\n def fit(self, X, y=None):\n pass\n def predict(self, X):\n return np.zeros((len(X), 1), dtype=bool)",
"execution_count": 31,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "### Calculate accuracy for this dumb classifier\ncross_val_score(Never5Classifier(), X_train, y_train_5, cv=3, scoring='accuracy')",
"execution_count": 33,
"outputs": [
{
"data": {
"text/plain": "array([ 0.9124 , 0.9055 , 0.91105])"
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "* That's ~90-91% accuracy for this dataset when you always predict `not 5` as the result where the learning part is entirely skipped! \n* Our previous classifier was only ~4% more accurate than one of the worst possible classifiers. \n* Before we think of better classifier models, let us first learn better ways to measure performance \n\n### A. Confusion Matrix\n\n* Confusion matrix tells us the number of times category A was classified incorrectly as category B, C, D, etc. and similarly for B, C, D, etc.\n* E.g. 3rd row and 4th column in the confusion matrix tells us how many times the classifier incorrectyl classified the images of 3 with images of 5.\n* To create a confusion matrix we need actual labels and predicted labels. use `cross_val_predict` function in this case to get predictions instead of the scores.\n\n### cross_val_predict vs. cross_val_score & predict\n* Exercise: Repeat the following with Never5Classifier if you want to even though you know already what that confusion matrix would look like\n* `predict` function gives a prediction after the corresponding label has been seen and trained on already\n* Predictions obtained via `cross_val_predict` are generated using cross validation technique\n* This means that the predictions were generated without looking at the training labels\n* How is this possible?"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "from sklearn.model_selection import cross_val_predict\nfrom sklearn.metrics import confusion_matrix\n\ny_ps = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3) \nconfusion_matrix(y_train_5, y_ps)",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "array([[53553, 1026],\n [ 1443, 3978]])"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "| | not 5 | 5 |\n|:-:|:-:|:-:|\n| not 5 | | |\n | 5 | | &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; |\n\n* How to interpret the above result?\n * Rows = actual class & columns = predicted class\n* This means:\n * ~50k of the digits that are not digit 5 were correctly classified as `not 5`. These are called as _true negatives_.\n * ~1k of the digits that are not digit 5 were incorrectly classified as `5`. These are called _false positives_. \n * ~1.5k of the digits that are digit 5 were incorrectly classified as `not 5`. These are called _false negatives_.\n * ~4k of the digits that are digit 5 were correctly classified as `5`. These are called _true positives_."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "confusion_matrix(y_train_5, y_train_5)",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "array([[54579, 0],\n [ 0, 5421]])"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
]
},
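{
"metadata": {},
"cell_type": "markdown",
"source": "* A sketch: for binary labels sklearn lays the matrix out as `[[TN, FP], [FN, TP]]`, so `ravel()` lets us unpack the four counts by name for the formulas below."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Unpack the 2x2 confusion matrix into its four named counts.\n# For binary labels sklearn orders it [[TN, FP], [FN, TP]].\ntn, fp, fn, tp = confusion_matrix(y_train_5, y_ps).ravel()\ntn, fp, fn, tp",
"execution_count": null,
"outputs": []
},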
{
"metadata": {},
"cell_type": "markdown",
"source": "### B. Precision & Recall\n\n$$ Precision = \\frac{TP}{TP + FP} $$\n\n$ $\n\n$$ Recall = \\frac{TP}{TP + FN} $$\n\n$ $\nWhere: \n\n$ $\nTP = True positives, FP = False positives, FN = False negatives"
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "from sklearn.metrics import precision_score, recall_score\nprecision_score(y_train_5, y_ps)\nrecall_score(y_train_5, y_ps)",
"execution_count": 38,
"outputs": [
{
"data": {
"text/plain": "0.79496402877697847"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "0.73381294964028776"
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
]
},
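{
"metadata": {},
"cell_type": "markdown",
"source": "* A sanity-check sketch: recompute both scores by hand from the confusion-matrix counts unpacked above; they should match sklearn's values."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Recompute precision and recall from the tn/fp/fn/tp counts\n# unpacked from the confusion matrix earlier.\ntp / (tp + fp)  # precision\ntp / (tp + fn)  # recall",
"execution_count": null,
"outputs": []
},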
{
"metadata": {},
"cell_type": "markdown",
"source": "### What do these values mean?\n\n* When our SGDClassifier based model predicts a `5`, the accuracy is ~80%.\n* It detects ~73% of 5s.\n* Precision and recall measures can be combined into one measure called an F1 score:\n\n$ $\n$$ F_1 = \\frac{TP}{TP + \\frac{FN+FP}{2}} $$\n\n$ $\n* F1 prefers classifiers that have similar Precision and Recall scores. Sometimes you want classifiers that have high precision or high recall;\n In those cases you still use precision and recall scores as performance measures. Give examples."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "from sklearn.metrics import f1_score\nf1_score(y_train_5, y_ps)",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": "0.76316546762589932"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
]
},
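{
"metadata": {},
"cell_type": "markdown",
"source": "* A sanity-check sketch: F1 is the harmonic mean of precision and recall, so it can also be recomputed from the two scores above."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# F1 as the harmonic mean of precision and recall;\n# should match f1_score() up to floating-point rounding.\np = precision_score(y_train_5, y_ps)\nr = recall_score(y_train_5, y_ps)\n2 * p * r / (p + r)",
"execution_count": null,
"outputs": []
},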
{
"metadata": {},
"cell_type": "markdown",
"source": "How to get high precision as well as high recall? That's not possible because as one increases the other one decreases and vice-versa.\n### Precision/Recall trade-off"
}
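,
{
"metadata": {},
"cell_type": "markdown",
"source": "* A minimal sketch of how to inspect the trade-off: get decision scores via cross-validation, then plot precision and recall as the decision threshold varies (`precision_recall_curve` from `sklearn.metrics`)."
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "# Sketch: visualise the precision/recall trade-off.\n# decision_function scores let us evaluate any threshold, not just the default of 0.\nfrom sklearn.metrics import precision_recall_curve\n\ny_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,\n                             method='decision_function')\nprecisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)\n\n# precisions/recalls have one more entry than thresholds, hence the [:-1]\na = plt.plot(thresholds, precisions[:-1], label='precision')\na = plt.plot(thresholds, recalls[:-1], label='recall')\na = plt.xlabel('decision threshold')\na = plt.legend()",
"execution_count": null,
"outputs": []
}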
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "MNIST.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}