brianspiering · November 8, 2019 16:13
diff --git a/missing_data_tree_ensemble.ipynb b/missing_data_tree_ensemble.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Explore-how-different-tree-based-ensemble-methods-handle-missing-values\" data-toc-modified-id=\"Explore-how-different-tree-based-ensemble-methods-handle-missing-values-1\">Explore how different tree-based ensemble methods handle missing values</a></span></li><li><span><a href=\"#Missing-Values-in-Training\" data-toc-modified-id=\"Missing-Values-in-Training-2\">Missing Values in Training</a></span></li><li><span><a href=\"#Missing-Values-in-Test\" data-toc-modified-id=\"Missing-Values-in-Test-3\">Missing Values in Test</a></span></li><li><span><a href=\"#Summary\" data-toc-modified-id=\"Summary-4\">Summary</a></span></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Explore how different tree-based ensemble methods handle missing values\n",
    "-----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "reset -fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the iris data for simplicity\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, y = load_iris(return_X_y=True)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Missing Values in Training\n",
    "--------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's replace a single actual value with a missing value\n",
    "X_train[0][0] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Try Random Forest™__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Input contains NaN, infinity or a value too large for dtype('float32').",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-63-71f4c191ff7c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m (RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n\u001b[0;32m----> 2\u001b[0;31m  \u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m  .score(X_train, y_train))\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    248\u001b[0m         \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    250\u001b[0m         \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    251\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m    540\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    541\u001b[0m             _assert_all_finite(array,\n\u001b[0;32m--> 542\u001b[0;31m                                allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m    543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    544\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m     54\u001b[0m                 not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m     55\u001b[0m             \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     57\u001b[0m     \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     58\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'object'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mallow_nan\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')."
     ]
    }
   ],
   "source": [
    "(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
    " .fit(X_train, y_train)\n",
    " .score(X_train, y_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> __Missing value replacement for the training set__\n",
    "\n",
    "> Random forests has two ways of replacing missing values. The first way is fast. If the mth variable is not categorical, the method computes the median of all values of this variable in class j, then it uses this value to replace all missing values of the mth variable in class j. If the mth variable is categorical, the replacement is the most frequent non-missing value in class j. These replacement values are called fills.\n",
    "\n",
    "> The second way of replacing missing values is computationally more expensive but has given better performance than the first, even with large amounts of missing data. It replaces missing values only in the training set. It begins by doing a rough and inaccurate filling in of the missing values. Then it does a forest run and computes proximities.\n",
    "\n",
    "> If x(m,n) is a missing continuous value, estimate its fill as an average over the non-missing values of the mth variables weighted by the proximities between the nth case and the non-missing value case. If it is a missing categorical variable, replace it by the most frequent non-missing value where frequency is weighted by proximity.\n",
    "\n",
    "> Now iterate-construct a forest again using these newly filled in values, find new fills and iterate again. Our experience is that 4-6 iterations are enough.\n",
    "\n",
    "https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#missing1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9809523809523809"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "# Create our imputer to replace missing values with the median\n",
    "imp = SimpleImputer(missing_values=np.nan, strategy='median')\n",
    "imp = imp.fit(X_train)\n",
    "\n",
    "# Impute our data, then train\n",
    "X_train_imp = imp.transform(X_train)\n",
    "\n",
    "(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
    " .fit(X_train_imp, y_train)\n",
    " .score(X_train_imp, y_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Try XGBoost__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: xgboost in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (0.90)\r\n",
      "Requirement already satisfied: numpy in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (from xgboost) (1.17.3)\r\n",
      "Requirement already satisfied: scipy in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (from xgboost) (1.3.1)\r\n"
     ]
    }
   ],
   "source": [
    "! pip install xgboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(XGBClassifier()\n",
    ".fit(X_train, y_train)\n",
    ".score(X_train, y_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Missing Values in Test\n",
    "--------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's replace a single value\n",
    "X_test[0][0] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Input contains NaN, infinity or a value too large for dtype('float32').",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-59-10bcf716e49b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# scikit-learn's Random Forest™ completely breaks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m (RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n\u001b[0;32m----> 3\u001b[0;31m  \u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m  .score(X_test, y_test))\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    248\u001b[0m         \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    250\u001b[0m         \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    251\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m    540\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    541\u001b[0m             _assert_all_finite(array,\n\u001b[0;32m--> 542\u001b[0;31m                                allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m    543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    544\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m     54\u001b[0m                 not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m     55\u001b[0m             \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     57\u001b[0m     \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     58\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'object'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mallow_nan\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')."
     ]
    }
   ],
   "source": [
    "# scikit-learn's Random Forest™ completely breaks\n",
    "(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
    " .fit(X_train, y_train)\n",
    " .score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9555555555555556"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create our imputer to replace missing values with the median\n",
    "imp = SimpleImputer(missing_values=np.nan, strategy='median')\n",
    "imp = imp.fit(X_train)\n",
    "\n",
    "# Impute our data\n",
    "X_train_imp = imp.transform(X_train)\n",
    "X_test_imp = imp.transform(X_test)\n",
    "\n",
    "(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
    " .fit(X_train_imp, y_train)\n",
    " .score(X_test_imp, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> __Missing value replacement for the test set__\n",
    "\n",
    "When there is a test set, there are two different methods of replacement depending on whether labels exist for the test set.\n",
    "\n",
    "If they do, then the fills derived from the training set are used as replacements. If labels no not exist, then each case in the test set is replicated nclass times (nclass= number of classes). The first replicate of a case is assumed to be class 1 and the class one fills used to replace missing values. The 2nd replicate is assumed class 2 and the class 2 fills used on it.\n",
    "\n",
    "This augmented test set is run down the tree. In each set of replicates, the one receiving the most votes determines the class of the original case.\n",
    "\n",
    "https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#missing2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9555555555555556"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# XGBoost call handle it\n",
    "(XGBClassifier()\n",
    ".fit(X_train, y_train)\n",
    ".score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Summary\n",
    "-----\n",
    "\n",
    "Random Forest™ needs to impute missing values.\n",
    "\n",
    "XGBoost can handle missing values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"toc": true
	},
	"source": [
	"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
	"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Explore-how-different-tree-based-ensemble-methods-handle-missing-values\" data-toc-modified-id=\"Explore-how-different-tree-based-ensemble-methods-handle-missing-values-1\">Explore how different tree-based ensemble methods handle missing values</a></span></li><li><span><a href=\"#Missing-Values-in-Training\" data-toc-modified-id=\"Missing-Values-in-Training-2\">Missing Values in Training</a></span></li><li><span><a href=\"#Missing-Values-in-Test\" data-toc-modified-id=\"Missing-Values-in-Test-3\">Missing Values in Test</a></span></li><li><span><a href=\"#Summary\" data-toc-modified-id=\"Summary-4\">Summary</a></span></li></ul></div>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Explore how different tree-based ensemble methods handle missing values\n",
	"-----"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [],
	"source": [
	"reset -fs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Use the iris data for simplicity\n",
	"from sklearn.datasets import load_iris\n",
	"from sklearn.model_selection import train_test_split"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {},
	"outputs": [],
	"source": [
	"X, y = load_iris(return_X_y=True)\n",
	"\n",
	"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Missing Values in Training\n",
	"--------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Let's replace a single actual value with a missing value\n",
	"X_train[0][0] = np.nan"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"__Try Random Forest™__"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import RandomForestClassifier"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"metadata": {},
	"outputs": [
	{
	"ename": "ValueError",
	"evalue": "Input contains NaN, infinity or a value too large for dtype('float32').",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-63-71f4c191ff7c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m (RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m .score(X_train, y_train))\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 541\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 542\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m 54\u001b[0m not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m 55\u001b[0m \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'object'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mallow_nan\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')."
	]
	}
	],
	"source": [
	"(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
	" .fit(X_train, y_train)\n",
	" .score(X_train, y_train))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"> __Missing value replacement for the training set__\n",
	"\n",
	"> Random forests has two ways of replacing missing values. The first way is fast. If the mth variable is not categorical, the method computes the median of all values of this variable in class j, then it uses this value to replace all missing values of the mth variable in class j. If the mth variable is categorical, the replacement is the most frequent non-missing value in class j. These replacement values are called fills.\n",
	"\n",
	"> The second way of replacing missing values is computationally more expensive but has given better performance than the first, even with large amounts of missing data. It replaces missing values only in the training set. It begins by doing a rough and inaccurate filling in of the missing values. Then it does a forest run and computes proximities.\n",
	"\n",
	"> If x(m,n) is a missing continuous value, estimate its fill as an average over the non-missing values of the mth variables weighted by the proximities between the nth case and the non-missing value case. If it is a missing categorical variable, replace it by the most frequent non-missing value where frequency is weighted by proximity.\n",
	"\n",
	"> Now iterate-construct a forest again using these newly filled in values, find new fills and iterate again. Our experience is that 4-6 iterations are enough.\n",
	"\n",
	"https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#missing1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 54,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.9809523809523809"
	]
	},
	"execution_count": 54,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.impute import SimpleImputer\n",
	"\n",
	"# Create our imputer to replace missing values with the median\n",
	"imp = SimpleImputer(missing_values=np.nan, strategy='median')\n",
	"imp = imp.fit(X_train)\n",
	"\n",
	"# Impute our data, then train\n",
	"X_train_imp = imp.transform(X_train)\n",
	"\n",
	"(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
	" .fit(X_train_imp, y_train)\n",
	" .score(X_train_imp, y_train))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"__Try XGBoost__"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Requirement already satisfied: xgboost in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (0.90)\r\n",
	"Requirement already satisfied: numpy in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (from xgboost) (1.17.3)\r\n",
	"Requirement already satisfied: scipy in /Users/brian/anaconda3/envs/3.7/lib/python3.7/site-packages (from xgboost) (1.3.1)\r\n"
	]
	}
	],
	"source": [
	"! pip install xgboost"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {},
	"outputs": [],
	"source": [
	"from xgboost import XGBClassifier"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.0"
	]
	},
	"execution_count": 57,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"(XGBClassifier()\n",
	".fit(X_train, y_train)\n",
	".score(X_train, y_train))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Missing Values in Test\n",
	"--------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 58,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Let's replace a single value\n",
	"X_test[0][0] = np.nan"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 59,
	"metadata": {},
	"outputs": [
	{
	"ename": "ValueError",
	"evalue": "Input contains NaN, infinity or a value too large for dtype('float32').",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-59-10bcf716e49b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# scikit-learn's Random Forest™ completely breaks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m (RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m .score(X_test, y_test))\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 541\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 542\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 543\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m~/anaconda3/envs/3.7/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m 54\u001b[0m not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m 55\u001b[0m \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'object'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mallow_nan\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')."
	]
	}
	],
	"source": [
	"# scikit-learn's Random Forest™ completely breaks on NaN (here already in .fit, because X_train still contains missing values)\n",
	"(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
	" .fit(X_train, y_train)\n",
	" .score(X_test, y_test))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 60,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.9555555555555556"
	]
	},
	"execution_count": 60,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Create our imputer to replace missing values with the median\n",
	"imp = SimpleImputer(missing_values=np.nan, strategy='median')\n",
	"imp = imp.fit(X_train)\n",
	"\n",
	"# Impute our data\n",
	"X_train_imp = imp.transform(X_train)\n",
	"X_test_imp = imp.transform(X_test)\n",
	"\n",
	"(RandomForestClassifier(n_estimators=20, min_samples_leaf=3, max_features='sqrt')\n",
	" .fit(X_train_imp, y_train)\n",
	" .score(X_test_imp, y_test))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"> __Missing value replacement for the test set__\n",
	"\n",
	"When there is a test set, there are two different methods of replacement depending on whether labels exist for the test set.\n",
	"\n",
	"If they do, then the fills derived from the training set are used as replacements. If labels do not exist, then each case in the test set is replicated nclass times (nclass = number of classes). The first replicate of a case is assumed to be class 1 and the class one fills used to replace missing values. The 2nd replicate is assumed class 2 and the class 2 fills used on it.\n",
	"\n",
	"This augmented test set is run down the tree. In each set of replicates, the one receiving the most votes determines the class of the original case.\n",
	"\n",
	"https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#missing2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 61,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.9555555555555556"
	]
	},
	"execution_count": 61,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# XGBoost can handle it\n",
	"(XGBClassifier()\n",
	".fit(X_train, y_train)\n",
	".score(X_test, y_test))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Summary\n",
	"-----\n",
	"\n",
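	"scikit-learn's Random Forest™ (in the version used here) rejects missing values, so they must be imputed before fitting or scoring.\n",
	"\n",
	"XGBoost can handle missing values natively, in both the training and the test data."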
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	" "
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.5"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": false,
	"sideBar": false,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": true,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}