raghavrv · February 22, 2016 14:16
diff --git a/.gitignore b/.gitignore
 *swp
 *ipynb_checkpoints
 *build
 *.dat
diff --git a/baseline_score.npy b/baseline_score.npy
diff --git a/log b/log
 The shape of the dataset is (581012, 54)
 The number of trees for this benchmarking is 100
 Score with the entire dataset = 0.95
 Score RF with the 1.82173199202 % missing = 0.94
 Score RF+Imp. with the 1.82173199202 % missing = 0.95
 Score RF with the 3.55273467929 % missing = 0.96
 Score RF+Imp. with the 3.55273467929 % missing = 0.96
 Score RF with the 5.19537302857 % missing = 0.96
 Score RF+Imp. with the 5.19537302857 % missing = 0.96
 Score RF with the 6.75907503408 % missing = 0.97
 Score RF+Imp. with the 6.75907503408 % missing = 0.97
 Score RF with the 8.24469488869 % missing = 0.97
 Score RF+Imp. with the 8.24469488869 % missing = 0.97
 Score RF with the 9.65472823791 % missing = 0.97
 Score RF+Imp. with the 9.65472823791 % missing = 0.97
 Score RF with the 10.9961424906 % missing = 0.97
 Score RF+Imp. with the 10.9961424906 % missing = 0.98
 Score RF with the 12.2712707406 % missing = 0.98
 Score RF+Imp. with the 12.2712707406 % missing = 0.98
 Score RF with the 13.4793894739 % missing = 0.98
 Score RF+Imp. with the 13.4793894739 % missing = 0.98
 Score RF with the 14.6313482146 % missing = 0.98
 Score RF+Imp. with the 14.6313482146 % missing = 0.98
 Score RF with the 15.722225792 % missing = 0.98
 Score RF+Imp. with the 15.722225792 % missing = 0.98
 Score RF with the 16.7580461779 % missing = 0.98
 Score RF+Imp. with the 16.7580461779 % missing = 0.98
 Score RF with the 17.7444126226 % missing = 0.98
 Score RF+Imp. with the 17.7444126226 % missing = 0.98
 Score RF with the 18.6789537846 % missing = 0.98
 Score RF+Imp. with the 18.6789537846 % missing = 0.99
 Score RF with the 19.5694338945 % missing = 0.98
 Score RF+Imp. with the 19.5694338945 % missing = 0.98
 Score RF with the 20.4148903918 % missing = 0.98
 Score RF+Imp. with the 20.4148903918 % missing = 0.98
 Score RF with the 21.2185360613 % missing = 0.98
 Score RF+Imp. with the 21.2185360613 % missing = 0.99
 Score RF with the 21.9800458 % missing = 0.98
 Score RF+Imp. with the 21.9800458 % missing = 0.99
 Score RF with the 22.7055551348 % missing = 0.98
 Score RF+Imp. with the 22.7055551348 % missing = 0.99
 Score RF with the 23.3928265904 % missing = 0.98
 Score RF+Imp. with the 23.3928265904 % missing = 0.99
 Score RF with the 24.0459845159 % missing = 0.98
 Score RF+Imp. with the 24.0459845159 % missing = 0.99
 Score RF with the 24.6653890746 % missing = 0.98
 Score RF+Imp. with the 24.6653890746 % missing = 0.99
 Score RF with the 25.2552315487 % missing = 0.98
 Score RF+Imp. with the 25.2552315487 % missing = 0.99
 Score RF with the 25.8147979859 % missing = 0.98
 Score RF+Imp. with the 25.8147979859 % missing = 0.99
 Score RF with the 26.347275673 % missing = 0.98
 Score RF+Imp. with the 26.347275673 % missing = 0.99
 Score RF with the 26.8534359334 % missing = 0.98
 Score RF+Imp. with the 26.8534359334 % missing = 0.99
 Score RF with the 27.3326508715 % missing = 0.98
 Score RF+Imp. with the 27.3326508715 % missing = 0.99
 Score RF with the 27.7895579896 % missing = 0.98
 Score RF+Imp. with the 27.7895579896 % missing = 0.99
 Score RF with the 28.2251995305 % missing = 0.98
 Score RF+Imp. with the 28.2251995305 % missing = 0.99
 Score RF with the 28.6352089113 % missing = 0.98
 Score RF+Imp. with the 28.6352089113 % missing = 0.99
 Score RF with the 29.0269296408 % missing = 0.99
 Score RF+Imp. with the 29.0269296408 % missing = 0.99
 Score RF with the 29.3992557303 % missing = 0.99
 Score RF+Imp. with the 29.3992557303 % missing = 0.99
 Score RF with the 29.7540389935 % missing = 0.98
 Score RF+Imp. with the 29.7540389935 % missing = 0.99
 Score RF with the 30.0889495238 % missing = 0.99
 Score RF+Imp. with the 30.0889495238 % missing = 0.99
 Score RF with the 30.4072383537 % missing = 0.99
 Score RF+Imp. with the 30.4072383537 % missing = 0.99
 Score RF with the 30.7117740413 % missing = 0.99
 Score RF+Imp. with the 30.7117740413 % missing = 0.99
 Score RF with the 30.9991270659 % missing = 0.99
 Score RF+Imp. with the 30.9991270659 % missing = 0.99
 Score RF with the 31.2725134 % missing = 0.99
 Score RF+Imp. with the 31.2725134 % missing = 0.99
 Score RF with the 31.5319171071 % missing = 0.99
 Score RF+Imp. with the 31.5319171071 % missing = 0.99
 Score RF with the 31.7783836172 % missing = 0.99
 Score RF+Imp. with the 31.7783836172 % missing = 0.99
 he
diff --git a/miss_val_bench.py b/miss_val_bench.py
 import numpy as np
 import matplotlib.pyplot as plt

 from sklearn.datasets import fetch_covtype, load_digits, load_iris
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import StratifiedShuffleSplit
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import Imputer
 from sklearn.model_selection import cross_val_score

 from time import time

 # Benchmark a RandomForest that handles NaNs itself (rf_missing) against a
 # median-imputation + RandomForest pipeline (rf_impute) while a growing
 # share of the covtype feature values is replaced by NaN.

 # dataset = load_digits()
 # dataset = load_iris()
 dataset = fetch_covtype()
 X, y = dataset.data, dataset.target

 # Optionally restrict the problem to two classes:
 # mask = (y == 1) | (y == 2)
 # X = X[mask]
 # y = y[mask]

 # Subsample: keep every second sample.
 X, y = X[::2].copy(), y[::2].copy()

 n_samples, n_features = X.shape

 n_estimators = 100
 n_jobs = -1
 n_rounds = 70          # number of missingness levels to evaluate
 round_fraction = 0.05  # share of all cells drawn as missing in each round
 missing_class = 2      # only samples with this label lose feature values

 # Generator used exclusively for drawing the missing-value masks.
 rng = np.random.RandomState(42)

 # The splitter gets an integer seed on purpose.  When it shares a
 # RandomState *instance*, every cv.split() call consumes random numbers and
 # returns different partitions, so the baseline, RF and RF+Imp. scores would
 # each be computed on different train/test splits (and the mask sequence
 # would depend on how many times the splitter had been called).
 cv = StratifiedShuffleSplit(n_iter=3, test_size=0.3, random_state=42)


 def _timed_cv_score(est, data, target):
     """Return (mean cross-validation score, elapsed seconds) for est."""
     tic = time()
     mean_score = cross_val_score(est, data, target, cv=cv).mean()
     return mean_score, time() - tic


 print("The shape of the dataset is %s" % str(X.shape))
 print("The number of trees for this benchmarking is %s" % n_estimators)

 # Estimate the score on the entire dataset, with no missing values
 estimator = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
                                    missing_values=None, n_jobs=n_jobs)
 baseline_score, elapsed = _timed_cv_score(estimator, X, y)
 print("Score with the entire dataset = %.2f in %d seconds"
       % (baseline_score, elapsed))

 scores_missing = []
 scores_impute = []
 seconds_missing = []
 seconds_impute = []

 rf_missing = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
                                     missing_values='NaN', n_jobs=n_jobs)
 rf_impute = Pipeline([("imputer", Imputer(missing_values='NaN',
                                           strategy="median", axis=0)),
                       ("forest", RandomForestClassifier(
                                          random_state=0,
                                          n_estimators=n_estimators,
                                          n_jobs=n_jobs))])

 missing_fraction_range = []
 missing_mask = np.zeros(X.shape, dtype=bool)
 # Rows that are never allowed to contain missing values.
 protected_rows = (y != missing_class)

 X_missing = X.copy()
 for _ in range(n_rounds):
     # Mark the cells holding the smallest `round_fraction` of a fresh normal
     # draw and accumulate them on top of the cells already missing.
     rv = rng.randn(*X.shape)
     thresh = np.sort(rv.ravel())[int(round_fraction * n_samples * n_features)]
     missing_mask |= rv < thresh
     # Features go missing only for samples of class `missing_class`.
     missing_mask[protected_rows] = False
     missing_fraction = np.mean(missing_mask)
     missing_fraction_range.append(missing_fraction)
     X_missing[missing_mask] = np.nan

     score_missing, elapsed = _timed_cv_score(rf_missing, X_missing, y)
     scores_missing.append(score_missing)
     seconds_missing.append(elapsed)
     print("Score RF with the %s %% missing = %.2f in %d seconds"
           % (missing_fraction*100, score_missing, elapsed))

     score_impute, elapsed = _timed_cv_score(rf_impute, X_missing, y)
     scores_impute.append(score_impute)
     seconds_impute.append(elapsed)
     print("Score RF+Imp. with the %s %% missing = %.2f in %d seconds"
           % (missing_fraction*100, score_impute, elapsed))

 np.save('scores_missing.npy', scores_missing)
 np.save('scores_impute.npy', scores_impute)
 np.save('missing_fraction_range.npy', missing_fraction_range)
 np.save('baseline_score.npy', baseline_score)
 # Timings were previously printed and then discarded; persist them so they
 # can be plotted alongside the scores.
 np.save('seconds_missing.npy', seconds_missing)
 np.save('seconds_impute.npy', seconds_impute)
diff --git a/miss_val_plot.py b/miss_val_plot.py
 import numpy as np
 import matplotlib.pyplot as plt

 # Read back the arrays from the .npy files in the working directory.
 baseline_score = np.load('baseline_score.npy')
 missing_fraction_range = np.load('missing_fraction_range.npy')
 scores_missing = np.load('scores_missing.npy')
 scores_impute = np.load('scores_impute.npy')

 # Start from a clean slate, then draw one dashed curve per estimator.
 plt.close('all')
 for curve, colour, legend_text in ((scores_missing, 'r', 'RF mv'),
                                    (scores_impute, 'b', 'RF imp.')):
     plt.plot(missing_fraction_range, curve, 'o--',
              color=colour, label=legend_text)

 # Horizontal reference line at the baseline score.
 plt.axhline(baseline_score, color='k', label='no missing')

 plt.xlabel('Missing fraction')
 plt.ylabel('Score')
 plt.legend(loc='best')
 plt.show()
diff --git a/missing_fraction_range.npy b/missing_fraction_range.npy
diff --git a/MissingValRF_alex_desktop.ipynb b/MissingValRF_alex_desktop.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "At host \"tsilinuxd74\" with 16 cores. Current Dir - /cal/homes/vrajagopalan/raghav/miss_val_bench\n",
      "sklearn 0.18.dev0 in branch \"missing_values_rf\", (last commit \"359cc5d\")- np v1.10.4 - scipy v0.17.0\n",
      "Running on IPython v4.0.3; Python 2.7.11 :: Anaconda 2.5.0 (64-bit)\n",
      "@ /tsi/doctorants/raghav/anaconda/anaconda3\n"
     ]
    }
   ],
   "source": [
    "# Confirm if this is Alex's PC\n",
    "import IPython\n",
    "import sklearn, numpy as np, scipy\n",
    "from ast import literal_eval\n",
    "CURR_IPYTHON_VERSION = IPython.__version__\n",
    "PYTHON_INPT_VERSION = literal_eval(IPython.sys_info())['sys_executable'] + \" --version\"\n",
    "SKVERSION = sklearn.__version__; SCVERSION = scipy.__version__; NPVERSION = np.__version__\n",
    "!echo \"At host \\\"$(hostname)\\\" with $(nproc) cores. Current Dir - $(pwd)\"; \n",
    "!echo -n \"sklearn $(python -c 'import sklearn; print sklearn.__version__') \"\n",
    "!echo -n \"in branch \\\"$(git --git-dir $HOME/raghav/scikit-learn/.git rev-parse --abbrev-ref HEAD)\\\", \"\n",
    "!echo -n \"(last commit \\\"$(git --git-dir $HOME/raghav/scikit-learn/.git log --pretty=format:'%h' -n 1)\\\")\"\n",
    "!echo -e -n \"- np v$NPVERSION - scipy v$SCVERSION\\nRunning on IPython v$CURR_IPYTHON_VERSION; \"\n",
    "!echo -n \"`$PYTHON_INPT_VERSION`\"\n",
    "!echo \"@ /tsi/doctorants/raghav/anaconda/anaconda3\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "import numpy as np\n",
    "plt.rcParams['figure.figsize'][:] = [10, 10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/rvraghav93/Desktop/scikit_sandbox/adult_dataset\n"
     ]
    }
   ],
   "source": [
    "cd ../adult_dataset/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "baseline_score = np.load('baseline_score.npy')\n",
    "missing_fraction_range = np.load('missing_fraction_range.npy')\n",
    "scores_missing = np.load('scores_missing.npy')\n",
    "scores_impute = np.load('scores_impute.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.close('all')\n",
    "plt.plot(missing_fraction_range, seconds_missing, '.--', color='r', label='RF MV enabled')\n",
    "plt.plot(missing_fraction_range, seconds_impute, '.--', color='b', label='RF+imputer')\n",
    "plt.axhline(35, label='RF w/No missing', color='k')\n",
    "#for sample_pt in missing_fraction_range:\n",
    "#    plt.axvline(sample_pt, linestyle='--', color='g')\n",
    "plt.xlabel('Missing fraction')\n",
    "plt.ylabel('Time taken for cross_val_score using 3 iterations of StratifiedShuffleSplit in seconds')\n",
    "plt.legend(loc='best')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_mldata\n",
    "\n",
    "adult = fetch_mldata('yeast')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cat_feats = np.load('cat_feats.npy').tolist()\n",
    "feat_names = np.load('feat_names.npy').tolist()\n",
    "data = np.load('data.npy')\n",
    "target = np.load('target.npy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((48842, 12), dtype('float64'))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape, data.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1030465582899962"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(np.isnan(data)) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import Imputer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The cross val score with RF (MV) (computed in  57.51 seconds). 0.8525\n",
      "The cross val score with RF+Imp(Median) (computed in  56.93 seconds). 0.8488\n",
      "The cross val score with RF+Imp(Mean) (computed in  59.11 seconds). 0.8486\n"
     ]
    }
   ],
   "source": [
    "rf_missing = RandomForestClassifier(n_estimators=100,\n",
    "                                    missing_values='NaN',\n",
    "                                    n_jobs=-1)\n",
    "\n",
     "rf_impute = Pipeline([('imp', Imputer(strategy='median')), \n",
    "                      ('rf', RandomForestClassifier(n_estimators=100,\n",
    "                                                    n_jobs=-1))])\n",
    "\n",
    "\n",
    "rf_impute2 = Pipeline([('imp', Imputer(strategy='mean')), \n",
    "                      ('rf', RandomForestClassifier(n_estimators=100,\n",
    "                                                    n_jobs=-1))])\n",
    "\n",
    "cv = StratifiedKFold(n_folds=15)\n",
    "\n",
    "\n",
    "print \"The cross val score with RF (MV) (computed in \", \n",
    "t = time.time()\n",
    "cv_rf_missing = cross_val_score(rf_missing, X=data, y=target, cv=cv)\n",
    "t -= time.time()\n",
    "print \"%0.2f seconds).\" % abs(t), \n",
    "print \"%0.4f\" % np.mean(cv_rf_missing)\n",
    "\n",
    "print \"The cross val score with RF+Imp(Median) (computed in \", \n",
    "t = time.time()\n",
    "cv_rf_imp_median = cross_val_score(rf_impute, X=data, y=target, cv=cv)\n",
    "t -= time.time()\n",
    "print \"%0.2f seconds).\" % abs(t), \n",
    "print \"%0.4f\" % np.mean(cv_rf_imp_median)\n",
    "\n",
    "print \"The cross val score with RF+Imp(Mean) (computed in \", \n",
    "t = time.time()\n",
    "cv_rf_imp_mean = cross_val_score(rf_impute2, X=data, y=target, cv=cv)\n",
    "t -= time.time()\n",
    "print \"%0.2f seconds).\" % abs(t), \n",
    "print \"%0.4f\" % np.mean(cv_rf_imp_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(48842, 12)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(6, 1)\n",
      "[1 1 1 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "km = KMeans(n_clusters=2)\n",
    "X = np.array([[1], [2], [3], [10], [12], [13]])\n",
    "km.fit(X)\n",
    "\n",
    "print X.shape\n",
    "\n",
    "print km.labels_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/rvraghav93/.local/lib/python2.7/site-packages/sklearn/utils/validation.py:407: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10,\n",
       "    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,\n",
       "    verbose=0)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "km = KMeans(n_clusters=1)\n",
    "km.fit(np.ravel(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0], dtype=int32)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "km.labels_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import export_graphviz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "export_graphviz?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The shape of the dataset is (581012, 54)\r\n",
      "The number of trees for this benchmarking is 100\r\n",
      "Score with the entire dataset = 0.95\r\n",
      "Score RF with the 1.82173199202 % missing = 0.94\r\n",
      "Score RF+Imp. with the 1.82173199202 % missing = 0.95\r\n",
      "Score RF with the 3.55273467929 % missing = 0.96\r\n",
      "Score RF+Imp. with the 3.55273467929 % missing = 0.96\r\n",
      "Score RF with the 5.19537302857 % missing = 0.96\r\n",
      "Score RF+Imp. with the 5.19537302857 % missing = 0.96\r\n",
      "Score RF with the 6.75907503408 % missing = 0.97\r\n",
      "Score RF+Imp. with the 6.75907503408 % missing = 0.97\r\n",
      "Score RF with the 8.24469488869 % missing = 0.97\r\n",
      "Score RF+Imp. with the 8.24469488869 % missing = 0.97\r\n",
      "Score RF with the 9.65472823791 % missing = 0.97\r\n",
      "Score RF+Imp. with the 9.65472823791 % missing = 0.97\r\n",
      "Score RF with the 10.9961424906 % missing = 0.97\r\n",
      "Score RF+Imp. with the 10.9961424906 % missing = 0.98\r\n",
      "Score RF with the 12.2712707406 % missing = 0.98\r\n",
      "Score RF+Imp. with the 12.2712707406 % missing = 0.98\r\n",
      "Score RF with the 13.4793894739 % missing = 0.98\r\n",
      "Score RF+Imp. with the 13.4793894739 % missing = 0.98\r\n",
      "Score RF with the 14.6313482146 % missing = 0.98\r\n",
      "Score RF+Imp. with the 14.6313482146 % missing = 0.98\r\n",
      "Score RF with the 15.722225792 % missing = 0.98\r\n",
      "Score RF+Imp. with the 15.722225792 % missing = 0.98\r\n",
      "Score RF with the 16.7580461779 % missing = 0.98\r\n",
      "Score RF+Imp. with the 16.7580461779 % missing = 0.98\r\n",
      "Score RF with the 17.7444126226 % missing = 0.98\r\n",
      "Score RF+Imp. with the 17.7444126226 % missing = 0.98\r\n",
      "Score RF with the 18.6789537846 % missing = 0.98\r\n",
      "Score RF+Imp. with the 18.6789537846 % missing = 0.99\r\n",
      "Score RF with the 19.5694338945 % missing = 0.98\r\n",
      "Score RF+Imp. with the 19.5694338945 % missing = 0.98\r\n",
      "Score RF with the 20.4148903918 % missing = 0.98\r\n",
      "Score RF+Imp. with the 20.4148903918 % missing = 0.98\r\n",
      "Score RF with the 21.2185360613 % missing = 0.98\r\n",
      "Score RF+Imp. with the 21.2185360613 % missing = 0.99\r\n",
      "Score RF with the 21.9800458 % missing = 0.98\r\n",
      "Score RF+Imp. with the 21.9800458 % missing = 0.99\r\n",
      "Score RF with the 22.7055551348 % missing = 0.98\r\n",
      "Score RF+Imp. with the 22.7055551348 % missing = 0.99\r\n",
      "Score RF with the 23.3928265904 % missing = 0.98\r\n",
      "Score RF+Imp. with the 23.3928265904 % missing = 0.99\r\n",
      "Score RF with the 24.0459845159 % missing = 0.98\r\n",
      "Score RF+Imp. with the 24.0459845159 % missing = 0.99\r\n",
      "Score RF with the 24.6653890746 % missing = 0.98\r\n",
      "Score RF+Imp. with the 24.6653890746 % missing = 0.99\r\n",
      "Score RF with the 25.2552315487 % missing = 0.98\r\n",
      "Score RF+Imp. with the 25.2552315487 % missing = 0.99\r\n",
      "Score RF with the 25.8147979859 % missing = 0.98\r\n",
      "Score RF+Imp. with the 25.8147979859 % missing = 0.99\r\n",
      "Score RF with the 26.347275673 % missing = 0.98\r\n",
      "Score RF+Imp. with the 26.347275673 % missing = 0.99\r\n",
      "Score RF with the 26.8534359334 % missing = 0.98\r\n",
      "Score RF+Imp. with the 26.8534359334 % missing = 0.99\r\n",
      "Score RF with the 27.3326508715 % missing = 0.98\r\n",
      "Score RF+Imp. with the 27.3326508715 % missing = 0.99\r\n",
      "Score RF with the 27.7895579896 % missing = 0.98\r\n",
      "Score RF+Imp. with the 27.7895579896 % missing = 0.99\r\n",
      "Score RF with the 28.2251995305 % missing = 0.98\r\n",
      "Score RF+Imp. with the 28.2251995305 % missing = 0.99\r\n",
      "Score RF with the 28.6352089113 % missing = 0.98\r\n",
      "Score RF+Imp. with the 28.6352089113 % missing = 0.99\r\n",
      "Score RF with the 29.0269296408 % missing = 0.99\r\n",
      "Score RF+Imp. with the 29.0269296408 % missing = 0.99\r\n",
      "Score RF with the 29.3992557303 % missing = 0.99\r\n",
      "Score RF+Imp. with the 29.3992557303 % missing = 0.99\r\n",
      "Score RF with the 29.7540389935 % missing = 0.98\r\n",
      "Score RF+Imp. with the 29.7540389935 % missing = 0.99\r\n",
      "Score RF with the 30.0889495238 % missing = 0.99\r\n",
      "Score RF+Imp. with the 30.0889495238 % missing = 0.99\r\n",
      "Score RF with the 30.4072383537 % missing = 0.99\r\n",
      "Score RF+Imp. with the 30.4072383537 % missing = 0.99\r\n",
      "Score RF with the 30.7117740413 % missing = 0.99\r\n",
      "Score RF+Imp. with the 30.7117740413 % missing = 0.99\r\n",
      "Score RF with the 30.9991270659 % missing = 0.99\r\n",
      "Score RF+Imp. with the 30.9991270659 % missing = 0.99\r\n",
      "Score RF with the 31.2725134 % missing = 0.99\r\n",
      "Score RF+Imp. with the 31.2725134 % missing = 0.99\r\n",
      "Score RF with the 31.5319171071 % missing = 0.99\r\n",
      "Score RF+Imp. with the 31.5319171071 % missing = 0.99\r\n",
      "Score RF with the 31.7783836172 % missing = 0.99\r\n",
      "Score RF+Imp. with the 31.7783836172 % missing = 0.99\r\n",
      "he\r\n"
     ]
    }
   ],
   "source": [
    "k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
diff --git a/MissingValRF_ragv_desktop.ipynb b/MissingValRF_ragv_desktop.ipynb
diff --git a/scores_impute.npy b/scores_impute.npy
diff --git a/scores_missing.npy b/scores_missing.npy
	The shape of the dataset is (581012, 54)
	The number of trees for this benchmarking is 100
	Score with the entire dataset = 0.95
	Score RF with the 1.82173199202 % missing = 0.94
	Score RF+Imp. with the 1.82173199202 % missing = 0.95
	Score RF with the 3.55273467929 % missing = 0.96
	Score RF+Imp. with the 3.55273467929 % missing = 0.96
	Score RF with the 5.19537302857 % missing = 0.96
	Score RF+Imp. with the 5.19537302857 % missing = 0.96
	Score RF with the 6.75907503408 % missing = 0.97
	Score RF+Imp. with the 6.75907503408 % missing = 0.97
	Score RF with the 8.24469488869 % missing = 0.97
	Score RF+Imp. with the 8.24469488869 % missing = 0.97
	Score RF with the 9.65472823791 % missing = 0.97
	Score RF+Imp. with the 9.65472823791 % missing = 0.97
	Score RF with the 10.9961424906 % missing = 0.97
	Score RF+Imp. with the 10.9961424906 % missing = 0.98
	Score RF with the 12.2712707406 % missing = 0.98
	Score RF+Imp. with the 12.2712707406 % missing = 0.98
	Score RF with the 13.4793894739 % missing = 0.98
	Score RF+Imp. with the 13.4793894739 % missing = 0.98
	Score RF with the 14.6313482146 % missing = 0.98
	Score RF+Imp. with the 14.6313482146 % missing = 0.98
	Score RF with the 15.722225792 % missing = 0.98
	Score RF+Imp. with the 15.722225792 % missing = 0.98
	Score RF with the 16.7580461779 % missing = 0.98
	Score RF+Imp. with the 16.7580461779 % missing = 0.98
	Score RF with the 17.7444126226 % missing = 0.98
	Score RF+Imp. with the 17.7444126226 % missing = 0.98
	Score RF with the 18.6789537846 % missing = 0.98
	Score RF+Imp. with the 18.6789537846 % missing = 0.99
	Score RF with the 19.5694338945 % missing = 0.98
	Score RF+Imp. with the 19.5694338945 % missing = 0.98
	Score RF with the 20.4148903918 % missing = 0.98
	Score RF+Imp. with the 20.4148903918 % missing = 0.98
	Score RF with the 21.2185360613 % missing = 0.98
	Score RF+Imp. with the 21.2185360613 % missing = 0.99
	Score RF with the 21.9800458 % missing = 0.98
	Score RF+Imp. with the 21.9800458 % missing = 0.99
	Score RF with the 22.7055551348 % missing = 0.98
	Score RF+Imp. with the 22.7055551348 % missing = 0.99
	Score RF with the 23.3928265904 % missing = 0.98
	Score RF+Imp. with the 23.3928265904 % missing = 0.99
	Score RF with the 24.0459845159 % missing = 0.98
	Score RF+Imp. with the 24.0459845159 % missing = 0.99
	Score RF with the 24.6653890746 % missing = 0.98
	Score RF+Imp. with the 24.6653890746 % missing = 0.99
	Score RF with the 25.2552315487 % missing = 0.98
	Score RF+Imp. with the 25.2552315487 % missing = 0.99
	Score RF with the 25.8147979859 % missing = 0.98
	Score RF+Imp. with the 25.8147979859 % missing = 0.99
	Score RF with the 26.347275673 % missing = 0.98
	Score RF+Imp. with the 26.347275673 % missing = 0.99
	Score RF with the 26.8534359334 % missing = 0.98
	Score RF+Imp. with the 26.8534359334 % missing = 0.99
	Score RF with the 27.3326508715 % missing = 0.98
	Score RF+Imp. with the 27.3326508715 % missing = 0.99
	Score RF with the 27.7895579896 % missing = 0.98
	Score RF+Imp. with the 27.7895579896 % missing = 0.99
	Score RF with the 28.2251995305 % missing = 0.98
	Score RF+Imp. with the 28.2251995305 % missing = 0.99
	Score RF with the 28.6352089113 % missing = 0.98
	Score RF+Imp. with the 28.6352089113 % missing = 0.99
	Score RF with the 29.0269296408 % missing = 0.99
	Score RF+Imp. with the 29.0269296408 % missing = 0.99
	Score RF with the 29.3992557303 % missing = 0.99
	Score RF+Imp. with the 29.3992557303 % missing = 0.99
	Score RF with the 29.7540389935 % missing = 0.98
	Score RF+Imp. with the 29.7540389935 % missing = 0.99
	Score RF with the 30.0889495238 % missing = 0.99
	Score RF+Imp. with the 30.0889495238 % missing = 0.99
	Score RF with the 30.4072383537 % missing = 0.99
	Score RF+Imp. with the 30.4072383537 % missing = 0.99
	Score RF with the 30.7117740413 % missing = 0.99
	Score RF+Imp. with the 30.7117740413 % missing = 0.99
	Score RF with the 30.9991270659 % missing = 0.99
	Score RF+Imp. with the 30.9991270659 % missing = 0.99
	Score RF with the 31.2725134 % missing = 0.99
	Score RF+Imp. with the 31.2725134 % missing = 0.99
	Score RF with the 31.5319171071 % missing = 0.99
	Score RF+Imp. with the 31.5319171071 % missing = 0.99
	Score RF with the 31.7783836172 % missing = 0.99
	Score RF+Imp. with the 31.7783836172 % missing = 0.99
	he
	import numpy as np
	import matplotlib.pyplot as plt

	from sklearn.datasets import fetch_covtype, load_digits, load_iris
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import StratifiedShuffleSplit
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import Imputer
	from sklearn.model_selection import cross_val_score

	from time import time

	# Benchmark a RandomForest that handles NaNs itself (rf_missing) against a
	# median-imputation + RandomForest pipeline (rf_impute) while a growing
	# share of the covtype feature values is replaced by NaN.

	# dataset = load_digits()
	# dataset = load_iris()
	dataset = fetch_covtype()
	X, y = dataset.data, dataset.target

	# Optionally restrict the problem to two classes:
	# mask = (y == 1) | (y == 2)
	# X = X[mask]
	# y = y[mask]

	# Subsample: keep every second sample.
	X, y = X[::2].copy(), y[::2].copy()

	n_samples, n_features = X.shape

	n_estimators = 100
	n_jobs = -1
	n_rounds = 70          # number of missingness levels to evaluate
	round_fraction = 0.05  # share of all cells drawn as missing in each round
	missing_class = 2      # only samples with this label lose feature values

	# Generator used exclusively for drawing the missing-value masks.
	rng = np.random.RandomState(42)

	# The splitter gets an integer seed on purpose.  When it shares a
	# RandomState *instance*, every cv.split() call consumes random numbers and
	# returns different partitions, so the baseline, RF and RF+Imp. scores would
	# each be computed on different train/test splits (and the mask sequence
	# would depend on how many times the splitter had been called).
	cv = StratifiedShuffleSplit(n_iter=3, test_size=0.3, random_state=42)


	def _timed_cv_score(est, data, target):
	    """Return (mean cross-validation score, elapsed seconds) for est."""
	    tic = time()
	    mean_score = cross_val_score(est, data, target, cv=cv).mean()
	    return mean_score, time() - tic


	print("The shape of the dataset is %s" % str(X.shape))
	print("The number of trees for this benchmarking is %s" % n_estimators)

	# Estimate the score on the entire dataset, with no missing values
	estimator = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
	                                   missing_values=None, n_jobs=n_jobs)
	baseline_score, elapsed = _timed_cv_score(estimator, X, y)
	print("Score with the entire dataset = %.2f in %d seconds"
	      % (baseline_score, elapsed))

	scores_missing = []
	scores_impute = []
	seconds_missing = []
	seconds_impute = []

	rf_missing = RandomForestClassifier(random_state=0, n_estimators=n_estimators,
	                                    missing_values='NaN', n_jobs=n_jobs)
	rf_impute = Pipeline([("imputer", Imputer(missing_values='NaN',
	                                          strategy="median", axis=0)),
	                      ("forest", RandomForestClassifier(
	                                         random_state=0,
	                                         n_estimators=n_estimators,
	                                         n_jobs=n_jobs))])

	missing_fraction_range = []
	missing_mask = np.zeros(X.shape, dtype=bool)
	# Rows that are never allowed to contain missing values.
	protected_rows = (y != missing_class)

	X_missing = X.copy()
	for _ in range(n_rounds):
	    # Mark the cells holding the smallest `round_fraction` of a fresh normal
	    # draw and accumulate them on top of the cells already missing.
	    rv = rng.randn(*X.shape)
	    thresh = np.sort(rv.ravel())[int(round_fraction * n_samples * n_features)]
	    missing_mask |= rv < thresh
	    # Features go missing only for samples of class `missing_class`.
	    missing_mask[protected_rows] = False
	    missing_fraction = np.mean(missing_mask)
	    missing_fraction_range.append(missing_fraction)
	    X_missing[missing_mask] = np.nan

	    score_missing, elapsed = _timed_cv_score(rf_missing, X_missing, y)
	    scores_missing.append(score_missing)
	    seconds_missing.append(elapsed)
	    print("Score RF with the %s %% missing = %.2f in %d seconds"
	          % (missing_fraction*100, score_missing, elapsed))

	    score_impute, elapsed = _timed_cv_score(rf_impute, X_missing, y)
	    scores_impute.append(score_impute)
	    seconds_impute.append(elapsed)
	    print("Score RF+Imp. with the %s %% missing = %.2f in %d seconds"
	          % (missing_fraction*100, score_impute, elapsed))

	np.save('scores_missing.npy', scores_missing)
	np.save('scores_impute.npy', scores_impute)
	np.save('missing_fraction_range.npy', missing_fraction_range)
	np.save('baseline_score.npy', baseline_score)
	# Timings were previously printed and then discarded; persist them so they
	# can be plotted alongside the scores.
	np.save('seconds_missing.npy', seconds_missing)
	np.save('seconds_impute.npy', seconds_impute)
	import numpy as np
	import matplotlib.pyplot as plt

	# Read the four result arrays back from their .npy files.
	baseline_score, missing_fraction_range, scores_missing, scores_impute = (
	    np.load('baseline_score.npy'),
	    np.load('missing_fraction_range.npy'),
	    np.load('scores_missing.npy'),
	    np.load('scores_impute.npy'),
	)

	# Start from a clean slate, then draw one dashed, circle-marked curve per
	# estimator against the missing fraction.
	plt.close('all')
	curves = (
	    (scores_missing, 'r', 'RF mv'),
	    (scores_impute, 'b', 'RF imp.'),
	)
	for curve_scores, curve_color, curve_label in curves:
	    plt.plot(missing_fraction_range, curve_scores, 'o--',
	             color=curve_color, label=curve_label)

	# Horizontal reference line at the baseline score.
	plt.axhline(baseline_score, color='k', label='no missing')

	plt.xlabel('Missing fraction')
	plt.ylabel('Score')
	plt.legend(loc='best')
	plt.show()
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"At host \"tsilinuxd74\" with 16 cores. Current Dir - /cal/homes/vrajagopalan/raghav/miss_val_bench\n",
	"sklearn 0.18.dev0 in branch \"missing_values_rf\", (last commit \"359cc5d\")- np v1.10.4 - scipy v0.17.0\n",
	"Running on IPython v4.0.3; Python 2.7.11 :: Anaconda 2.5.0 (64-bit)\n",
	"@ /tsi/doctorants/raghav/anaconda/anaconda3\n"
	]
	}
	],
	"source": [
	"# Confirm if this is Alex's PC\n",
	"import IPython\n",
	"import sklearn, numpy as np, scipy\n",
	"from ast import literal_eval\n",
	"CURR_IPYTHON_VERSION = IPython.__version__\n",
	"PYTHON_INPT_VERSION = literal_eval(IPython.sys_info())['sys_executable'] + \" --version\"\n",
	"SKVERSION = sklearn.__version__; SCVERSION = scipy.__version__; NPVERSION = np.__version__\n",
	"!echo \"At host \\\"$(hostname)\\\" with $(nproc) cores. Current Dir - $(pwd)\"; \n",
	"!echo -n \"sklearn $(python -c 'import sklearn; print sklearn.__version__') \"\n",
	"!echo -n \"in branch \\\"$(git --git-dir $HOME/raghav/scikit-learn/.git rev-parse --abbrev-ref HEAD)\\\", \"\n",
	"!echo -n \"(last commit \\\"$(git --git-dir $HOME/raghav/scikit-learn/.git log --pretty=format:'%h' -n 1)\\\")\"\n",
	"!echo -e -n \"- np v$NPVERSION - scipy v$SCVERSION\\nRunning on IPython v$CURR_IPYTHON_VERSION; \"\n",
	"!echo -n \"`$PYTHON_INPT_VERSION`\"\n",
	"!echo \"@ /tsi/doctorants/raghav/anaconda/anaconda3\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"from matplotlib import pyplot as plt\n",
	"import numpy as np\n",
	"plt.rcParams['figure.figsize'][:] = [10, 10]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"/home/rvraghav93/Desktop/scikit_sandbox/adult_dataset\n"
	]
	}
	],
	"source": [
	"cd ../adult_dataset/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"baseline_score = np.load('baseline_score.npy')\n",
	"missing_fraction_range = np.load('missing_fraction_range.npy')\n",
	"scores_missing = np.load('scores_missing.npy')\n",
	"scores_impute = np.load('scores_impute.npy')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"plt.close('all')\n",
	"plt.plot(missing_fraction_range, seconds_missing, '.--', color='r', label='RF MV enabled')\n",
	"plt.plot(missing_fraction_range, seconds_impute, '.--', color='b', label='RF+imputer')\n",
	"plt.axhline(35, label='RF w/No missing', color='k')\n",
	"#for sample_pt in missing_fraction_range:\n",
	"# plt.axvline(sample_pt, linestyle='--', color='g')\n",
	"plt.xlabel('Missing fraction')\n",
	"plt.ylabel('Time taken for cross_val_score using 3 iterations of StratifiedShuffleSplit in seconds')\n",
	"plt.legend(loc='best')\n",
	"plt.show()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.datasets import fetch_mldata\n",
	"\n",
	"adult = fetch_mldata('yeast')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"cat_feats = np.load('cat_feats.npy').tolist()\n",
	"feat_names = np.load('feat_names.npy').tolist()\n",
	"data = np.load('data.npy')\n",
	"target = np.load('target.npy')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"((48842, 12), dtype('float64'))"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data.shape, data.dtype"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1.1030465582899962"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"np.mean(np.isnan(data)) * 100"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.model_selection import StratifiedShuffleSplit\n",
	"from sklearn.model_selection import StratifiedKFold\n",
	"from sklearn.pipeline import Pipeline\n",
	"from sklearn.preprocessing import Imputer\n",
	"from sklearn.model_selection import cross_val_score\n",
	"import time"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The cross val score with RF (MV) (computed in 57.51 seconds). 0.8525\n",
	"The cross val score with RF+Imp(Median) (computed in 56.93 seconds). 0.8488\n",
	"The cross val score with RF+Imp(Mean) (computed in 59.11 seconds). 0.8486\n"
	]
	}
	],
	"source": [
	"rf_missing = RandomForestClassifier(n_estimators=100,\n",
	" missing_values='NaN',\n",
	" n_jobs=-1)\n",
	"\n",
	"rf_impute = Pipeline([('imp', Imputer(strategy='mean')), \n",
	" ('rf', RandomForestClassifier(n_estimators=100,\n",
	" n_jobs=-1))])\n",
	"\n",
	"\n",
	"rf_impute2 = Pipeline([('imp', Imputer(strategy='mean')), \n",
	" ('rf', RandomForestClassifier(n_estimators=100,\n",
	" n_jobs=-1))])\n",
	"\n",
	"cv = StratifiedKFold(n_folds=15)\n",
	"\n",
	"\n",
	"print \"The cross val score with RF (MV) (computed in \", \n",
	"t = time.time()\n",
	"cv_rf_missing = cross_val_score(rf_missing, X=data, y=target, cv=cv)\n",
	"t -= time.time()\n",
	"print \"%0.2f seconds).\" % abs(t), \n",
	"print \"%0.4f\" % np.mean(cv_rf_missing)\n",
	"\n",
	"print \"The cross val score with RF+Imp(Median) (computed in \", \n",
	"t = time.time()\n",
	"cv_rf_imp_median = cross_val_score(rf_impute, X=data, y=target, cv=cv)\n",
	"t -= time.time()\n",
	"print \"%0.2f seconds).\" % abs(t), \n",
	"print \"%0.4f\" % np.mean(cv_rf_imp_median)\n",
	"\n",
	"print \"The cross val score with RF+Imp(Mean) (computed in \", \n",
	"t = time.time()\n",
	"cv_rf_imp_mean = cross_val_score(rf_impute2, X=data, y=target, cv=cv)\n",
	"t -= time.time()\n",
	"print \"%0.2f seconds).\" % abs(t), \n",
	"print \"%0.4f\" % np.mean(cv_rf_imp_mean)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(48842, 12)"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"\n",
	"data.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"(6, 1)\n",
	"[1 1 1 0 0 0]\n"
	]
	}
	],
	"source": [
	"import numpy as np\n",
	"\n",
	"from sklearn.cluster import KMeans\n",
	"\n",
	"km = KMeans(n_clusters=2)\n",
	"X = np.array([[1], [2], [3], [10], [12], [13]])\n",
	"km.fit(X)\n",
	"\n",
	"print X.shape\n",
	"\n",
	"print km.labels_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/rvraghav93/.local/lib/python2.7/site-packages/sklearn/utils/validation.py:407: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.\n",
	" DeprecationWarning)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10,\n",
	" n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,\n",
	" verbose=0)"
	]
	},
	"execution_count": 46,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"km = KMeans(n_clusters=1)\n",
	"km.fit(np.ravel(X))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([0], dtype=int32)"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"km.labels_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.tree import export_graphviz"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"export_graphviz?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The shape of the dataset is (581012, 54)\r\n",
	"The number of trees for this benchmarking is 100\r\n",
	"Score with the entire dataset = 0.95\r\n",
	"Score RF with the 1.82173199202 % missing = 0.94\r\n",
	"Score RF+Imp. with the 1.82173199202 % missing = 0.95\r\n",
	"Score RF with the 3.55273467929 % missing = 0.96\r\n",
	"Score RF+Imp. with the 3.55273467929 % missing = 0.96\r\n",
	"Score RF with the 5.19537302857 % missing = 0.96\r\n",
	"Score RF+Imp. with the 5.19537302857 % missing = 0.96\r\n",
	"Score RF with the 6.75907503408 % missing = 0.97\r\n",
	"Score RF+Imp. with the 6.75907503408 % missing = 0.97\r\n",
	"Score RF with the 8.24469488869 % missing = 0.97\r\n",
	"Score RF+Imp. with the 8.24469488869 % missing = 0.97\r\n",
	"Score RF with the 9.65472823791 % missing = 0.97\r\n",
	"Score RF+Imp. with the 9.65472823791 % missing = 0.97\r\n",
	"Score RF with the 10.9961424906 % missing = 0.97\r\n",
	"Score RF+Imp. with the 10.9961424906 % missing = 0.98\r\n",
	"Score RF with the 12.2712707406 % missing = 0.98\r\n",
	"Score RF+Imp. with the 12.2712707406 % missing = 0.98\r\n",
	"Score RF with the 13.4793894739 % missing = 0.98\r\n",
	"Score RF+Imp. with the 13.4793894739 % missing = 0.98\r\n",
	"Score RF with the 14.6313482146 % missing = 0.98\r\n",
	"Score RF+Imp. with the 14.6313482146 % missing = 0.98\r\n",
	"Score RF with the 15.722225792 % missing = 0.98\r\n",
	"Score RF+Imp. with the 15.722225792 % missing = 0.98\r\n",
	"Score RF with the 16.7580461779 % missing = 0.98\r\n",
	"Score RF+Imp. with the 16.7580461779 % missing = 0.98\r\n",
	"Score RF with the 17.7444126226 % missing = 0.98\r\n",
	"Score RF+Imp. with the 17.7444126226 % missing = 0.98\r\n",
	"Score RF with the 18.6789537846 % missing = 0.98\r\n",
	"Score RF+Imp. with the 18.6789537846 % missing = 0.99\r\n",
	"Score RF with the 19.5694338945 % missing = 0.98\r\n",
	"Score RF+Imp. with the 19.5694338945 % missing = 0.98\r\n",
	"Score RF with the 20.4148903918 % missing = 0.98\r\n",
	"Score RF+Imp. with the 20.4148903918 % missing = 0.98\r\n",
	"Score RF with the 21.2185360613 % missing = 0.98\r\n",
	"Score RF+Imp. with the 21.2185360613 % missing = 0.99\r\n",
	"Score RF with the 21.9800458 % missing = 0.98\r\n",
	"Score RF+Imp. with the 21.9800458 % missing = 0.99\r\n",
	"Score RF with the 22.7055551348 % missing = 0.98\r\n",
	"Score RF+Imp. with the 22.7055551348 % missing = 0.99\r\n",
	"Score RF with the 23.3928265904 % missing = 0.98\r\n",
	"Score RF+Imp. with the 23.3928265904 % missing = 0.99\r\n",
	"Score RF with the 24.0459845159 % missing = 0.98\r\n",
	"Score RF+Imp. with the 24.0459845159 % missing = 0.99\r\n",
	"Score RF with the 24.6653890746 % missing = 0.98\r\n",
	"Score RF+Imp. with the 24.6653890746 % missing = 0.99\r\n",
	"Score RF with the 25.2552315487 % missing = 0.98\r\n",
	"Score RF+Imp. with the 25.2552315487 % missing = 0.99\r\n",
	"Score RF with the 25.8147979859 % missing = 0.98\r\n",
	"Score RF+Imp. with the 25.8147979859 % missing = 0.99\r\n",
	"Score RF with the 26.347275673 % missing = 0.98\r\n",
	"Score RF+Imp. with the 26.347275673 % missing = 0.99\r\n",
	"Score RF with the 26.8534359334 % missing = 0.98\r\n",
	"Score RF+Imp. with the 26.8534359334 % missing = 0.99\r\n",
	"Score RF with the 27.3326508715 % missing = 0.98\r\n",
	"Score RF+Imp. with the 27.3326508715 % missing = 0.99\r\n",
	"Score RF with the 27.7895579896 % missing = 0.98\r\n",
	"Score RF+Imp. with the 27.7895579896 % missing = 0.99\r\n",
	"Score RF with the 28.2251995305 % missing = 0.98\r\n",
	"Score RF+Imp. with the 28.2251995305 % missing = 0.99\r\n",
	"Score RF with the 28.6352089113 % missing = 0.98\r\n",
	"Score RF+Imp. with the 28.6352089113 % missing = 0.99\r\n",
	"Score RF with the 29.0269296408 % missing = 0.99\r\n",
	"Score RF+Imp. with the 29.0269296408 % missing = 0.99\r\n",
	"Score RF with the 29.3992557303 % missing = 0.99\r\n",
	"Score RF+Imp. with the 29.3992557303 % missing = 0.99\r\n",
	"Score RF with the 29.7540389935 % missing = 0.98\r\n",
	"Score RF+Imp. with the 29.7540389935 % missing = 0.99\r\n",
	"Score RF with the 30.0889495238 % missing = 0.99\r\n",
	"Score RF+Imp. with the 30.0889495238 % missing = 0.99\r\n",
	"Score RF with the 30.4072383537 % missing = 0.99\r\n",
	"Score RF+Imp. with the 30.4072383537 % missing = 0.99\r\n",
	"Score RF with the 30.7117740413 % missing = 0.99\r\n",
	"Score RF+Imp. with the 30.7117740413 % missing = 0.99\r\n",
	"Score RF with the 30.9991270659 % missing = 0.99\r\n",
	"Score RF+Imp. with the 30.9991270659 % missing = 0.99\r\n",
	"Score RF with the 31.2725134 % missing = 0.99\r\n",
	"Score RF+Imp. with the 31.2725134 % missing = 0.99\r\n",
	"Score RF with the 31.5319171071 % missing = 0.99\r\n",
	"Score RF+Imp. with the 31.5319171071 % missing = 0.99\r\n",
	"Score RF with the 31.7783836172 % missing = 0.99\r\n",
	"Score RF+Imp. with the 31.7783836172 % missing = 0.99\r\n",
	"he\r\n"
	]
	}
	],
	"source": [
	"k"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}