Skip to content

Instantly share code, notes, and snippets.

@nomatteus
Created September 15, 2019 14:12
Show Gist options
  • Save nomatteus/4497f3a489522889d081ba978bb95ca1 to your computer and use it in GitHub Desktop.
Save nomatteus/4497f3a489522889d081ba978bb95ca1 to your computer and use it in GitHub Desktop.
Her Code Camp Notebooks
# Conda environment for the Her Code Camp notebooks.
name: example-environment
channels:
  - conda-forge
dependencies:
  - python
  - numpy
  - graphviz
  # NOTE(review): the entries below are assumed to belong to the pip section
  # (indentation was not visible in the source) - confirm against the original file.
  - pip:
    - nbgitpuller
    - sphinx-gallery
    - pandas
    - matplotlib
    # "sklearn" is only a deprecated placeholder name on PyPI; the real package is scikit-learn.
    - scikit-learn
    - pydotplus
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h3>Building a Cancer Classifier using Random Forest</h3>"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>1- Load The Required Packages</h4>"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"import pandas as pd #data manipulation\n",
"from sklearn.model_selection import train_test_split #splitting the data to train and test\n",
"from sklearn import tree #running a decision tree\n",
"from sklearn.ensemble import RandomForestClassifier #running a random forest\n",
"from sklearn import datasets #saved datasets\n",
"\n",
"from sklearn import metrics #assessing model performance\n",
"from sklearn.metrics import classification_report #assessing model performance\n",
"from sklearn.metrics import confusion_matrix #assessing model performance\n",
"import matplotlib.pyplot as plt #visualize model performance\n",
"\n",
"pd.set_option('display.max_columns', 30) #display all columns in your data"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>2- Load The Data</h4>"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>0.07871</td>\n",
" <td>1.0950</td>\n",
" <td>0.9053</td>\n",
" <td>8.589</td>\n",
" <td>153.40</td>\n",
" <td>0.006399</td>\n",
" <td>0.04904</td>\n",
" <td>0.05373</td>\n",
" <td>0.01587</td>\n",
" <td>0.03003</td>\n",
" <td>0.006193</td>\n",
" <td>25.38</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>0.05667</td>\n",
" <td>0.5435</td>\n",
" <td>0.7339</td>\n",
" <td>3.398</td>\n",
" <td>74.08</td>\n",
" <td>0.005225</td>\n",
" <td>0.01308</td>\n",
" <td>0.01860</td>\n",
" <td>0.01340</td>\n",
" <td>0.01389</td>\n",
" <td>0.003532</td>\n",
" <td>24.99</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>0.05999</td>\n",
" <td>0.7456</td>\n",
" <td>0.7869</td>\n",
" <td>4.585</td>\n",
" <td>94.03</td>\n",
" <td>0.006150</td>\n",
" <td>0.04006</td>\n",
" <td>0.03832</td>\n",
" <td>0.02058</td>\n",
" <td>0.02250</td>\n",
" <td>0.004571</td>\n",
" <td>23.57</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>0.09744</td>\n",
" <td>0.4956</td>\n",
" <td>1.1560</td>\n",
" <td>3.445</td>\n",
" <td>27.23</td>\n",
" <td>0.009110</td>\n",
" <td>0.07458</td>\n",
" <td>0.05661</td>\n",
" <td>0.01867</td>\n",
" <td>0.05963</td>\n",
" <td>0.009208</td>\n",
" <td>14.91</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>0.05883</td>\n",
" <td>0.7572</td>\n",
" <td>0.7813</td>\n",
" <td>5.438</td>\n",
" <td>94.44</td>\n",
" <td>0.011490</td>\n",
" <td>0.02461</td>\n",
" <td>0.05688</td>\n",
" <td>0.01885</td>\n",
" <td>0.01756</td>\n",
" <td>0.005115</td>\n",
" <td>22.54</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"0 0.27760 0.3001 0.14710 0.2419 \n",
"1 0.07864 0.0869 0.07017 0.1812 \n",
"2 0.15990 0.1974 0.12790 0.2069 \n",
"3 0.28390 0.2414 0.10520 0.2597 \n",
"4 0.13280 0.1980 0.10430 0.1809 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"0 0.07871 1.0950 0.9053 8.589 \n",
"1 0.05667 0.5435 0.7339 3.398 \n",
"2 0.05999 0.7456 0.7869 4.585 \n",
"3 0.09744 0.4956 1.1560 3.445 \n",
"4 0.05883 0.7572 0.7813 5.438 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"0 153.40 0.006399 0.04904 0.05373 \n",
"1 74.08 0.005225 0.01308 0.01860 \n",
"2 94.03 0.006150 0.04006 0.03832 \n",
"3 27.23 0.009110 0.07458 0.05661 \n",
"4 94.44 0.011490 0.02461 0.05688 \n",
"\n",
" concave points error symmetry error fractal dimension error worst radius \\\n",
"0 0.01587 0.03003 0.006193 25.38 \n",
"1 0.01340 0.01389 0.003532 24.99 \n",
"2 0.02058 0.02250 0.004571 23.57 \n",
"3 0.01867 0.05963 0.009208 14.91 \n",
"4 0.01885 0.01756 0.005115 22.54 \n",
"\n",
" worst texture worst perimeter worst area worst smoothness worst compactness \\\n",
"0 17.33 184.60 2019.0 0.1622 0.6656 \n",
"1 23.41 158.80 1956.0 0.1238 0.1866 \n",
"2 25.53 152.50 1709.0 0.1444 0.4245 \n",
"3 26.50 98.87 567.7 0.2098 0.8663 \n",
"4 16.67 152.20 1575.0 0.1374 0.2050 \n",
"\n",
" worst concavity worst concave points worst symmetry worst fractal dimension \n",
"0 0.7119 0.2654 0.4601 0.11890 \n",
"1 0.2416 0.1860 0.2750 0.08902 \n",
"2 0.4504 0.2430 0.3613 0.08758 \n",
"3 0.6869 0.2575 0.6638 0.17300 \n",
"4 0.4000 0.1625 0.2364 0.07678 "
]
},
"execution_count": 4,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"cancer = datasets.load_breast_cancer()\n",
"X=pd.DataFrame(cancer.data,columns=cancer.feature_names) #define your features (pass the names directly so the columns are a flat index, not a MultiIndex)\n",
"Y=pd.Series(cancer.target) #define the target variable\n",
"X.head() #view the first few rows from your features"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'X' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-581203140a6e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#print the dimensions of the dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'X' is not defined"
]
}
],
"source": [
"#print the dimensions of the dataset and the number of cases in each class\n",
"print(X.shape)\n",
"print(Y.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MultiIndex([( 'mean radius',),\n",
" ( 'mean texture',),\n",
" ( 'mean perimeter',),\n",
" ( 'mean area',),\n",
" ( 'mean smoothness',),\n",
" ( 'mean compactness',),\n",
" ( 'mean concavity',),\n",
" ( 'mean concave points',),\n",
" ( 'mean symmetry',),\n",
" ( 'mean fractal dimension',),\n",
" ( 'radius error',),\n",
" ( 'texture error',),\n",
" ( 'perimeter error',),\n",
" ( 'area error',),\n",
" ( 'smoothness error',),\n",
" ( 'compactness error',),\n",
" ( 'concavity error',),\n",
" ( 'concave points error',),\n",
" ( 'symmetry error',),\n",
" ('fractal dimension error',),\n",
" ( 'worst radius',),\n",
" ( 'worst texture',),\n",
" ( 'worst perimeter',),\n",
" ( 'worst area',),\n",
" ( 'worst smoothness',),\n",
" ( 'worst compactness',),\n",
" ( 'worst concavity',),\n",
" ( 'worst concave points',),\n",
" ( 'worst symmetry',),\n",
" ('worst fractal dimension',)],\n",
" )"
]
},
"execution_count": 6,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's look at column names\n",
"X.columns"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>count</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" <td>569.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean</td>\n",
" <td>14.127292</td>\n",
" <td>19.289649</td>\n",
" <td>91.969033</td>\n",
" <td>654.889104</td>\n",
" <td>0.096360</td>\n",
" <td>0.104341</td>\n",
" <td>0.088799</td>\n",
" <td>0.048919</td>\n",
" <td>0.181162</td>\n",
" <td>0.062798</td>\n",
" <td>0.405172</td>\n",
" <td>1.216853</td>\n",
" <td>2.866059</td>\n",
" <td>40.337079</td>\n",
" <td>0.007041</td>\n",
" <td>0.025478</td>\n",
" <td>0.031894</td>\n",
" <td>0.011796</td>\n",
" <td>0.020542</td>\n",
" <td>0.003795</td>\n",
" <td>16.269190</td>\n",
" <td>25.677223</td>\n",
" <td>107.261213</td>\n",
" <td>880.583128</td>\n",
" <td>0.132369</td>\n",
" <td>0.254265</td>\n",
" <td>0.272188</td>\n",
" <td>0.114606</td>\n",
" <td>0.290076</td>\n",
" <td>0.083946</td>\n",
" </tr>\n",
" <tr>\n",
" <td>std</td>\n",
" <td>3.524049</td>\n",
" <td>4.301036</td>\n",
" <td>24.298981</td>\n",
" <td>351.914129</td>\n",
" <td>0.014064</td>\n",
" <td>0.052813</td>\n",
" <td>0.079720</td>\n",
" <td>0.038803</td>\n",
" <td>0.027414</td>\n",
" <td>0.007060</td>\n",
" <td>0.277313</td>\n",
" <td>0.551648</td>\n",
" <td>2.021855</td>\n",
" <td>45.491006</td>\n",
" <td>0.003003</td>\n",
" <td>0.017908</td>\n",
" <td>0.030186</td>\n",
" <td>0.006170</td>\n",
" <td>0.008266</td>\n",
" <td>0.002646</td>\n",
" <td>4.833242</td>\n",
" <td>6.146258</td>\n",
" <td>33.602542</td>\n",
" <td>569.356993</td>\n",
" <td>0.022832</td>\n",
" <td>0.157336</td>\n",
" <td>0.208624</td>\n",
" <td>0.065732</td>\n",
" <td>0.061867</td>\n",
" <td>0.018061</td>\n",
" </tr>\n",
" <tr>\n",
" <td>min</td>\n",
" <td>6.981000</td>\n",
" <td>9.710000</td>\n",
" <td>43.790000</td>\n",
" <td>143.500000</td>\n",
" <td>0.052630</td>\n",
" <td>0.019380</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.106000</td>\n",
" <td>0.049960</td>\n",
" <td>0.111500</td>\n",
" <td>0.360200</td>\n",
" <td>0.757000</td>\n",
" <td>6.802000</td>\n",
" <td>0.001713</td>\n",
" <td>0.002252</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.007882</td>\n",
" <td>0.000895</td>\n",
" <td>7.930000</td>\n",
" <td>12.020000</td>\n",
" <td>50.410000</td>\n",
" <td>185.200000</td>\n",
" <td>0.071170</td>\n",
" <td>0.027290</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.156500</td>\n",
" <td>0.055040</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25%</td>\n",
" <td>11.700000</td>\n",
" <td>16.170000</td>\n",
" <td>75.170000</td>\n",
" <td>420.300000</td>\n",
" <td>0.086370</td>\n",
" <td>0.064920</td>\n",
" <td>0.029560</td>\n",
" <td>0.020310</td>\n",
" <td>0.161900</td>\n",
" <td>0.057700</td>\n",
" <td>0.232400</td>\n",
" <td>0.833900</td>\n",
" <td>1.606000</td>\n",
" <td>17.850000</td>\n",
" <td>0.005169</td>\n",
" <td>0.013080</td>\n",
" <td>0.015090</td>\n",
" <td>0.007638</td>\n",
" <td>0.015160</td>\n",
" <td>0.002248</td>\n",
" <td>13.010000</td>\n",
" <td>21.080000</td>\n",
" <td>84.110000</td>\n",
" <td>515.300000</td>\n",
" <td>0.116600</td>\n",
" <td>0.147200</td>\n",
" <td>0.114500</td>\n",
" <td>0.064930</td>\n",
" <td>0.250400</td>\n",
" <td>0.071460</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50%</td>\n",
" <td>13.370000</td>\n",
" <td>18.840000</td>\n",
" <td>86.240000</td>\n",
" <td>551.100000</td>\n",
" <td>0.095870</td>\n",
" <td>0.092630</td>\n",
" <td>0.061540</td>\n",
" <td>0.033500</td>\n",
" <td>0.179200</td>\n",
" <td>0.061540</td>\n",
" <td>0.324200</td>\n",
" <td>1.108000</td>\n",
" <td>2.287000</td>\n",
" <td>24.530000</td>\n",
" <td>0.006380</td>\n",
" <td>0.020450</td>\n",
" <td>0.025890</td>\n",
" <td>0.010930</td>\n",
" <td>0.018730</td>\n",
" <td>0.003187</td>\n",
" <td>14.970000</td>\n",
" <td>25.410000</td>\n",
" <td>97.660000</td>\n",
" <td>686.500000</td>\n",
" <td>0.131300</td>\n",
" <td>0.211900</td>\n",
" <td>0.226700</td>\n",
" <td>0.099930</td>\n",
" <td>0.282200</td>\n",
" <td>0.080040</td>\n",
" </tr>\n",
" <tr>\n",
" <td>75%</td>\n",
" <td>15.780000</td>\n",
" <td>21.800000</td>\n",
" <td>104.100000</td>\n",
" <td>782.700000</td>\n",
" <td>0.105300</td>\n",
" <td>0.130400</td>\n",
" <td>0.130700</td>\n",
" <td>0.074000</td>\n",
" <td>0.195700</td>\n",
" <td>0.066120</td>\n",
" <td>0.478900</td>\n",
" <td>1.474000</td>\n",
" <td>3.357000</td>\n",
" <td>45.190000</td>\n",
" <td>0.008146</td>\n",
" <td>0.032450</td>\n",
" <td>0.042050</td>\n",
" <td>0.014710</td>\n",
" <td>0.023480</td>\n",
" <td>0.004558</td>\n",
" <td>18.790000</td>\n",
" <td>29.720000</td>\n",
" <td>125.400000</td>\n",
" <td>1084.000000</td>\n",
" <td>0.146000</td>\n",
" <td>0.339100</td>\n",
" <td>0.382900</td>\n",
" <td>0.161400</td>\n",
" <td>0.317900</td>\n",
" <td>0.092080</td>\n",
" </tr>\n",
" <tr>\n",
" <td>max</td>\n",
" <td>28.110000</td>\n",
" <td>39.280000</td>\n",
" <td>188.500000</td>\n",
" <td>2501.000000</td>\n",
" <td>0.163400</td>\n",
" <td>0.345400</td>\n",
" <td>0.426800</td>\n",
" <td>0.201200</td>\n",
" <td>0.304000</td>\n",
" <td>0.097440</td>\n",
" <td>2.873000</td>\n",
" <td>4.885000</td>\n",
" <td>21.980000</td>\n",
" <td>542.200000</td>\n",
" <td>0.031130</td>\n",
" <td>0.135400</td>\n",
" <td>0.396000</td>\n",
" <td>0.052790</td>\n",
" <td>0.078950</td>\n",
" <td>0.029840</td>\n",
" <td>36.040000</td>\n",
" <td>49.540000</td>\n",
" <td>251.200000</td>\n",
" <td>4254.000000</td>\n",
" <td>0.222600</td>\n",
" <td>1.058000</td>\n",
" <td>1.252000</td>\n",
" <td>0.291000</td>\n",
" <td>0.663800</td>\n",
" <td>0.207500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"count 569.000000 569.000000 569.000000 569.000000 569.000000 \n",
"mean 14.127292 19.289649 91.969033 654.889104 0.096360 \n",
"std 3.524049 4.301036 24.298981 351.914129 0.014064 \n",
"min 6.981000 9.710000 43.790000 143.500000 0.052630 \n",
"25% 11.700000 16.170000 75.170000 420.300000 0.086370 \n",
"50% 13.370000 18.840000 86.240000 551.100000 0.095870 \n",
"75% 15.780000 21.800000 104.100000 782.700000 0.105300 \n",
"max 28.110000 39.280000 188.500000 2501.000000 0.163400 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.104341 0.088799 0.048919 0.181162 \n",
"std 0.052813 0.079720 0.038803 0.027414 \n",
"min 0.019380 0.000000 0.000000 0.106000 \n",
"25% 0.064920 0.029560 0.020310 0.161900 \n",
"50% 0.092630 0.061540 0.033500 0.179200 \n",
"75% 0.130400 0.130700 0.074000 0.195700 \n",
"max 0.345400 0.426800 0.201200 0.304000 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.062798 0.405172 1.216853 2.866059 \n",
"std 0.007060 0.277313 0.551648 2.021855 \n",
"min 0.049960 0.111500 0.360200 0.757000 \n",
"25% 0.057700 0.232400 0.833900 1.606000 \n",
"50% 0.061540 0.324200 1.108000 2.287000 \n",
"75% 0.066120 0.478900 1.474000 3.357000 \n",
"max 0.097440 2.873000 4.885000 21.980000 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 40.337079 0.007041 0.025478 0.031894 \n",
"std 45.491006 0.003003 0.017908 0.030186 \n",
"min 6.802000 0.001713 0.002252 0.000000 \n",
"25% 17.850000 0.005169 0.013080 0.015090 \n",
"50% 24.530000 0.006380 0.020450 0.025890 \n",
"75% 45.190000 0.008146 0.032450 0.042050 \n",
"max 542.200000 0.031130 0.135400 0.396000 \n",
"\n",
" concave points error symmetry error fractal dimension error \\\n",
"count 569.000000 569.000000 569.000000 \n",
"mean 0.011796 0.020542 0.003795 \n",
"std 0.006170 0.008266 0.002646 \n",
"min 0.000000 0.007882 0.000895 \n",
"25% 0.007638 0.015160 0.002248 \n",
"50% 0.010930 0.018730 0.003187 \n",
"75% 0.014710 0.023480 0.004558 \n",
"max 0.052790 0.078950 0.029840 \n",
"\n",
" worst radius worst texture worst perimeter worst area \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 16.269190 25.677223 107.261213 880.583128 \n",
"std 4.833242 6.146258 33.602542 569.356993 \n",
"min 7.930000 12.020000 50.410000 185.200000 \n",
"25% 13.010000 21.080000 84.110000 515.300000 \n",
"50% 14.970000 25.410000 97.660000 686.500000 \n",
"75% 18.790000 29.720000 125.400000 1084.000000 \n",
"max 36.040000 49.540000 251.200000 4254.000000 \n",
"\n",
" worst smoothness worst compactness worst concavity worst concave points \\\n",
"count 569.000000 569.000000 569.000000 569.000000 \n",
"mean 0.132369 0.254265 0.272188 0.114606 \n",
"std 0.022832 0.157336 0.208624 0.065732 \n",
"min 0.071170 0.027290 0.000000 0.000000 \n",
"25% 0.116600 0.147200 0.114500 0.064930 \n",
"50% 0.131300 0.211900 0.226700 0.099930 \n",
"75% 0.146000 0.339100 0.382900 0.161400 \n",
"max 0.222600 1.058000 1.252000 0.291000 \n",
"\n",
" worst symmetry worst fractal dimension \n",
"count 569.000000 569.000000 \n",
"mean 0.290076 0.083946 \n",
"std 0.061867 0.018061 \n",
"min 0.156500 0.055040 \n",
"25% 0.250400 0.071460 \n",
"50% 0.282200 0.080040 \n",
"75% 0.317900 0.092080 \n",
"max 0.663800 0.207500 "
]
},
"execution_count": 7,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's summarize the data\n",
"X.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>3- Split to Train and Test</h4>"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(398, 30)\n",
"(171, 30)\n",
"(398,)\n",
"(171,)\n"
]
}
],
"source": [
"#split the data into 70% train and 30% test\n",
"x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=42)\n",
"\n",
"print(x_train.shape)\n",
"print(x_test.shape)\n",
"print(y_train.shape)\n",
"print(y_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>4- Train your model: Random Forest</h4>"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9849246231155779"
]
},
"execution_count": 9,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"rf_model = RandomForestClassifier(max_depth=3,n_estimators=15) #define the model\n",
"rf_model.fit(x_train, y_train) #fit the model (train)\n",
"rf_model.score(x_train,y_train) #accuracy on the training data (not on new observations)\n",
"\n",
"#what is the accuracy of this model?"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Let's visualize one of the trees in the forest! (https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n",
" \"(https://pypi.org/project/six/).\", DeprecationWarning)\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 10,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#select which tree you want to visualize\n",
"selected_tree=2\n",
"\n",
"from io import StringIO #standard-library replacement for sklearn.externals.six.StringIO, which was removed from scikit-learn\n",
"from IPython.display import Image\n",
"from sklearn.tree import export_graphviz\n",
"import pydotplus\n",
"dot_data2 = StringIO()\n",
"export_graphviz(rf_model.estimators_[selected_tree],\n",
" out_file=dot_data2,\n",
" filled=True,\n",
" precision=2,\n",
" feature_names=x_train.columns,\n",
" rounded=True)\n",
"graph = pydotplus.graph_from_dot_data(dot_data2.getvalue())\n",
"Image(graph.create_png())"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>5- Predict!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>radius error</th>\n",
" <th>texture error</th>\n",
" <th>perimeter error</th>\n",
" <th>area error</th>\n",
" <th>smoothness error</th>\n",
" <th>compactness error</th>\n",
" <th>concavity error</th>\n",
" <th>concave points error</th>\n",
" <th>symmetry error</th>\n",
" <th>fractal dimension error</th>\n",
" <th>worst radius</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>204</td>\n",
" <td>12.47</td>\n",
" <td>18.6</td>\n",
" <td>81.09</td>\n",
" <td>481.9</td>\n",
" <td>0.09965</td>\n",
" <td>0.1058</td>\n",
" <td>0.08005</td>\n",
" <td>0.03821</td>\n",
" <td>0.1925</td>\n",
" <td>0.06373</td>\n",
" <td>0.3961</td>\n",
" <td>1.044</td>\n",
" <td>2.497</td>\n",
" <td>30.29</td>\n",
" <td>0.006953</td>\n",
" <td>0.01911</td>\n",
" <td>0.02701</td>\n",
" <td>0.01037</td>\n",
" <td>0.01782</td>\n",
" <td>0.003586</td>\n",
" <td>14.97</td>\n",
" <td>24.64</td>\n",
" <td>96.05</td>\n",
" <td>677.9</td>\n",
" <td>0.1426</td>\n",
" <td>0.2378</td>\n",
" <td>0.2671</td>\n",
" <td>0.1015</td>\n",
" <td>0.3014</td>\n",
" <td>0.0875</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"204 12.47 18.6 81.09 481.9 0.09965 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"204 0.1058 0.08005 0.03821 0.1925 \n",
"\n",
" mean fractal dimension radius error texture error perimeter error \\\n",
"204 0.06373 0.3961 1.044 2.497 \n",
"\n",
" area error smoothness error compactness error concavity error \\\n",
"204 30.29 0.006953 0.01911 0.02701 \n",
"\n",
" concave points error symmetry error fractal dimension error worst radius \\\n",
"204 0.01037 0.01782 0.003586 14.97 \n",
"\n",
" worst texture worst perimeter worst area worst smoothness \\\n",
"204 24.64 96.05 677.9 0.1426 \n",
"\n",
" worst compactness worst concavity worst concave points worst symmetry \\\n",
"204 0.2378 0.2671 0.1015 0.3014 \n",
"\n",
" worst fractal dimension \n",
"204 0.0875 "
]
},
"execution_count": 11,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's pull information from one patient from the test set\n",
"patient1_test=(x_test.iloc[0:1,:])\n",
"patient1_test"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1])"
]
},
"execution_count": 12,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#what would our model predict? Malignant (0) or Benign (1)?\n",
"rf_model.predict(patient1_test)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.09455427, 0.90544573]])"
]
},
"execution_count": 13,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#can we predict the probability of a patient being malignant or benign?\n",
"rf_model.predict_proba(patient1_test)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,\n",
" 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,\n",
" 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,\n",
" 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,\n",
" 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,\n",
" 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])"
]
},
"execution_count": 14,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#Can we predict multiple patients at once?\n",
"rf_model.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.09455427, 0.90544573],\n",
" [0.98808391, 0.01191609],\n",
" [0.98808391, 0.01191609],\n",
" [0.02520654, 0.97479346],\n",
" [0.01285535, 0.98714465],\n",
" [0.98557183, 0.01442817],\n",
" [0.99001627, 0.00998373],\n",
" [0.85784801, 0.14215199],\n",
" [0.79946919, 0.20053081],\n",
" [0.01329979, 0.98670021]])"
]
},
"execution_count": 15,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#can we get the probability of each test case being malignant or benign? (display the first 10 lines)\n",
"rf_model.predict_proba(x_test)[0:10]\n",
"\n",
"#do you see how the 0 and 1 were generated in the previous command?"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>6- How well did we predict?</h4>"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9649122807017544"
]
},
"execution_count": 16,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#what is the accuracy of the model on the test set?\n",
"rf_model.score(x_test,y_test)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>predicted benign</th>\n",
" <th>predicted malignant</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>benign</td>\n",
" <td>59</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>malignant</td>\n",
" <td>2</td>\n",
" <td>106</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" predicted benign predicted malignant\n",
"benign 59 4\n",
"malignant 2 106"
]
},
"execution_count": 17,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#let's generate a confusion matrix!\n",
"pd.DataFrame(confusion_matrix(y_test,rf_model.predict(x_test)),index=['benign','malignant'],columns=['predicted benign','predicted malignant'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>7- Identifying the important questions!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"#let's create a data frame that contains information about how important each question is in generating the correct prediction!\n",
"feature_importances = pd.DataFrame(rf_model.feature_importances_,\n",
" index = x_train.columns,\n",
" columns=['importance']).sort_values('importance', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>worst concave points</td>\n",
" <td>0.191349</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst radius</td>\n",
" <td>0.144533</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean concave points</td>\n",
" <td>0.111866</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst concavity</td>\n",
" <td>0.107259</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean radius</td>\n",
" <td>0.092839</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean area</td>\n",
" <td>0.071659</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst area</td>\n",
" <td>0.065903</td>\n",
" </tr>\n",
" <tr>\n",
" <td>area error</td>\n",
" <td>0.041340</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst perimeter</td>\n",
" <td>0.026521</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean texture</td>\n",
" <td>0.023976</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst compactness</td>\n",
" <td>0.022208</td>\n",
" </tr>\n",
" <tr>\n",
" <td>concavity error</td>\n",
" <td>0.019147</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst symmetry</td>\n",
" <td>0.018041</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst smoothness</td>\n",
" <td>0.013202</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean concavity</td>\n",
" <td>0.012525</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst texture</td>\n",
" <td>0.011488</td>\n",
" </tr>\n",
" <tr>\n",
" <td>perimeter error</td>\n",
" <td>0.004753</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean compactness</td>\n",
" <td>0.004630</td>\n",
" </tr>\n",
" <tr>\n",
" <td>fractal dimension error</td>\n",
" <td>0.003869</td>\n",
" </tr>\n",
" <tr>\n",
" <td>radius error</td>\n",
" <td>0.003524</td>\n",
" </tr>\n",
" <tr>\n",
" <td>concave points error</td>\n",
" <td>0.003100</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean perimeter</td>\n",
" <td>0.002389</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean smoothness</td>\n",
" <td>0.001754</td>\n",
" </tr>\n",
" <tr>\n",
" <td>symmetry error</td>\n",
" <td>0.001535</td>\n",
" </tr>\n",
" <tr>\n",
" <td>compactness error</td>\n",
" <td>0.000589</td>\n",
" </tr>\n",
" <tr>\n",
" <td>smoothness error</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>texture error</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean fractal dimension</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean symmetry</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>worst fractal dimension</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" importance\n",
"worst concave points 0.191349\n",
"worst radius 0.144533\n",
"mean concave points 0.111866\n",
"worst concavity 0.107259\n",
"mean radius 0.092839\n",
"mean area 0.071659\n",
"worst area 0.065903\n",
"area error 0.041340\n",
"worst perimeter 0.026521\n",
"mean texture 0.023976\n",
"worst compactness 0.022208\n",
"concavity error 0.019147\n",
"worst symmetry 0.018041\n",
"worst smoothness 0.013202\n",
"mean concavity 0.012525\n",
"worst texture 0.011488\n",
"perimeter error 0.004753\n",
"mean compactness 0.004630\n",
"fractal dimension error 0.003869\n",
"radius error 0.003524\n",
"concave points error 0.003100\n",
"mean perimeter 0.002389\n",
"mean smoothness 0.001754\n",
"symmetry error 0.001535\n",
"compactness error 0.000589\n",
"smoothness error 0.000000\n",
"texture error 0.000000\n",
"mean fractal dimension 0.000000\n",
"mean symmetry 0.000000\n",
"worst fractal dimension 0.000000"
]
},
"execution_count": 19,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#display the dataframe. Which questions do you think are important?\n",
"feature_importances"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<h4>8- Let's build another model with fewer features!</h4>"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>worst perimeter</th>\n",
" <th>worst concave points</th>\n",
" <th>worst radius</th>\n",
" <th>mean concave points</th>\n",
" <th>worst concavity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>184.60</td>\n",
" <td>0.2654</td>\n",
" <td>25.38</td>\n",
" <td>0.14710</td>\n",
" <td>0.7119</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>158.80</td>\n",
" <td>0.1860</td>\n",
" <td>24.99</td>\n",
" <td>0.07017</td>\n",
" <td>0.2416</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>152.50</td>\n",
" <td>0.2430</td>\n",
" <td>23.57</td>\n",
" <td>0.12790</td>\n",
" <td>0.4504</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>98.87</td>\n",
" <td>0.2575</td>\n",
" <td>14.91</td>\n",
" <td>0.10520</td>\n",
" <td>0.6869</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>152.20</td>\n",
" <td>0.1625</td>\n",
" <td>22.54</td>\n",
" <td>0.10430</td>\n",
" <td>0.4000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" worst perimeter worst concave points worst radius mean concave points \\\n",
"0 184.60 0.2654 25.38 0.14710 \n",
"1 158.80 0.1860 24.99 0.07017 \n",
"2 152.50 0.2430 23.57 0.12790 \n",
"3 98.87 0.2575 14.91 0.10520 \n",
"4 152.20 0.1625 22.54 0.10430 \n",
"\n",
" worst concavity \n",
"0 0.7119 \n",
"1 0.2416 \n",
"2 0.4504 \n",
"3 0.6869 \n",
"4 0.4000 "
]
},
"execution_count": 20,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#subset the questions we are interested in\n",
"X_reduced=X[['worst perimeter','worst concave points','worst radius','mean concave points','worst concavity']] #define your features\n",
"Y=pd.Series(cancer.target) #define the target\n",
"X_reduced.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"#split into train and test\n",
"x_train,x_test,y_train,y_test = train_test_split(X_reduced,Y,test_size=0.3,random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>worst perimeter</th>\n",
" <th>worst concave points</th>\n",
" <th>worst radius</th>\n",
" <th>mean concave points</th>\n",
" <th>worst concavity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>204</td>\n",
" <td>96.05</td>\n",
" <td>0.10150</td>\n",
" <td>14.97</td>\n",
" <td>0.03821</td>\n",
" <td>0.2671</td>\n",
" </tr>\n",
" <tr>\n",
" <td>70</td>\n",
" <td>165.90</td>\n",
" <td>0.17890</td>\n",
" <td>24.86</td>\n",
" <td>0.07951</td>\n",
" <td>0.2687</td>\n",
" </tr>\n",
" <tr>\n",
" <td>131</td>\n",
" <td>124.90</td>\n",
" <td>0.15140</td>\n",
" <td>19.26</td>\n",
" <td>0.08087</td>\n",
" <td>0.3791</td>\n",
" </tr>\n",
" <tr>\n",
" <td>431</td>\n",
" <td>89.61</td>\n",
" <td>0.07370</td>\n",
" <td>12.88</td>\n",
" <td>0.02799</td>\n",
" <td>0.2403</td>\n",
" </tr>\n",
" <tr>\n",
" <td>540</td>\n",
" <td>78.78</td>\n",
" <td>0.06918</td>\n",
" <td>12.26</td>\n",
" <td>0.02594</td>\n",
" <td>0.1797</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" worst perimeter worst concave points worst radius mean concave points \\\n",
"204 96.05 0.10150 14.97 0.03821 \n",
"70 165.90 0.17890 24.86 0.07951 \n",
"131 124.90 0.15140 19.26 0.08087 \n",
"431 89.61 0.07370 12.88 0.02799 \n",
"540 78.78 0.06918 12.26 0.02594 \n",
"\n",
" worst concavity \n",
"204 0.2671 \n",
"70 0.2687 \n",
"131 0.3791 \n",
"431 0.2403 \n",
"540 0.1797 "
]
},
"execution_count": 24,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"x_test.head()\n",
"# y_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.9623115577889447"
]
},
"execution_count": 20,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#train a new model!\n",
"rf_model = RandomForestClassifier(max_depth=3,n_estimators=15) #define the model\n",
"rf_model.fit(x_train, y_train) #fit the model (train)\n",
"rf_model.score(x_train,y_train) #predict on new observations\n",
"\n",
"#what is the accuracy of this model?"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=DeprecationWarning)\n"
]
},
{
"data": {
"text/plain": [
"['cancer_classifier.pkl']"
]
},
"execution_count": 21,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"#save the model!\n",
"from sklearn.externals import joblib\n",
"\n",
"joblib.dump(rf_model, \"cancer_classifier.pkl\") #save the whole model into a file to be used later\n",
"\n",
"#to load the model next time we just need to do:\n",
"#classifer = joblib.load(\"model.pkl\")\n",
"#classifer.predict(newobs)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"<center><h3>Congratulations! You have built your first classifier!</h3></center>\n",
"<center><h5>www.thecodinghive.com</h5></center>"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (system-wide)",
"language": "python",
"metadata": {
"cocalc": {
"description": "Python 3 programming language",
"priority": 100,
"url": "https://www.python.org/"
}
},
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment