Skip to content

Instantly share code, notes, and snippets.

@hirokiky
Created August 1, 2017 09:06
Show Gist options
  • Save hirokiky/6d1eb1fb327e6b215761ef6ac00c22d7 to your computer and use it in GitHub Desktop.
Save hirokiky/6d1eb1fb327e6b215761ef6ac00c22d7 to your computer and use it in GitHub Desktop.
I wanted to know what makes a high-quality wine.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', delimiter=';')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>fixed acidity</th>\n",
" <th>volatile acidity</th>\n",
" <th>citric acid</th>\n",
" <th>residual sugar</th>\n",
" <th>chlorides</th>\n",
" <th>free sulfur dioxide</th>\n",
" <th>total sulfur dioxide</th>\n",
" <th>density</th>\n",
" <th>pH</th>\n",
" <th>sulphates</th>\n",
" <th>alcohol</th>\n",
" <th>quality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7.4</td>\n",
" <td>0.70</td>\n",
" <td>0.00</td>\n",
" <td>1.9</td>\n",
" <td>0.076</td>\n",
" <td>11.0</td>\n",
" <td>34.0</td>\n",
" <td>0.9978</td>\n",
" <td>3.51</td>\n",
" <td>0.56</td>\n",
" <td>9.4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7.8</td>\n",
" <td>0.88</td>\n",
" <td>0.00</td>\n",
" <td>2.6</td>\n",
" <td>0.098</td>\n",
" <td>25.0</td>\n",
" <td>67.0</td>\n",
" <td>0.9968</td>\n",
" <td>3.20</td>\n",
" <td>0.68</td>\n",
" <td>9.8</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.8</td>\n",
" <td>0.76</td>\n",
" <td>0.04</td>\n",
" <td>2.3</td>\n",
" <td>0.092</td>\n",
" <td>15.0</td>\n",
" <td>54.0</td>\n",
" <td>0.9970</td>\n",
" <td>3.26</td>\n",
" <td>0.65</td>\n",
" <td>9.8</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.2</td>\n",
" <td>0.28</td>\n",
" <td>0.56</td>\n",
" <td>1.9</td>\n",
" <td>0.075</td>\n",
" <td>17.0</td>\n",
" <td>60.0</td>\n",
" <td>0.9980</td>\n",
" <td>3.16</td>\n",
" <td>0.58</td>\n",
" <td>9.8</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7.4</td>\n",
" <td>0.70</td>\n",
" <td>0.00</td>\n",
" <td>1.9</td>\n",
" <td>0.076</td>\n",
" <td>11.0</td>\n",
" <td>34.0</td>\n",
" <td>0.9978</td>\n",
" <td>3.51</td>\n",
" <td>0.56</td>\n",
" <td>9.4</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" fixed acidity volatile acidity citric acid residual sugar chlorides \\\n",
"0 7.4 0.70 0.00 1.9 0.076 \n",
"1 7.8 0.88 0.00 2.6 0.098 \n",
"2 7.8 0.76 0.04 2.3 0.092 \n",
"3 11.2 0.28 0.56 1.9 0.075 \n",
"4 7.4 0.70 0.00 1.9 0.076 \n",
"\n",
" free sulfur dioxide total sulfur dioxide density pH sulphates \\\n",
"0 11.0 34.0 0.9978 3.51 0.56 \n",
"1 25.0 67.0 0.9968 3.20 0.68 \n",
"2 15.0 54.0 0.9970 3.26 0.65 \n",
"3 17.0 60.0 0.9980 3.16 0.58 \n",
"4 11.0 34.0 0.9978 3.51 0.56 \n",
"\n",
" alcohol quality \n",
"0 9.4 5 \n",
"1 9.8 5 \n",
"2 9.8 5 \n",
"3 9.8 6 \n",
"4 9.4 5 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"X = df.iloc[:, :-1].values\n",
"y = df.iloc[:, -1].values"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.grid_search import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"gs = GridSearchCV(RandomForestClassifier(),\n",
" param_grid=[{'n_estimators': [3, 10, 20, 50],\n",
" 'max_depth': [3, 5, 10, 20, 30]}])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None, error_score='raise',\n",
" estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_split=1e-07, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid=[{'max_depth': [3, 5, 10, 20, 30], 'n_estimators': [3, 10, 20, 50]}],\n",
" pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'max_depth': 10, 'n_estimators': 50}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.6875"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=10, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_split=1e-07, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=50, n_jobs=1, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.69374999999999998"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"es = gs.best_estimator_\n",
"es.fit(X_train, y_train)\n",
"es.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.07894648, 0.10711908, 0.0643799 , 0.0656545 , 0.07742105,\n",
" 0.06366092, 0.10172709, 0.09202301, 0.067935 , 0.11266989,\n",
" 0.16846308])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"es.feature_importances_"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"from sklearn.svm import SVC\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"pipe = Pipeline([('sv', StandardScaler()), ('clf', SVC())])\n",
"gs_svm = GridSearchCV(pipe,\n",
" param_grid=[{'clf__gamma': [0.01, 0.1, 1, 10, 100],\n",
" 'clf__C': [0.01, 0.1, 1, 10, 100]}])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None, error_score='raise',\n",
" estimator=Pipeline(steps=[('sv', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False))]),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid=[{'clf__gamma': [0.01, 0.1, 1, 10, 100], 'clf__C': [0.01, 0.1, 1, 10, 100]}],\n",
" pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_svm.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.66041666666666665"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_svm.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'clf__C': 1, 'clf__gamma': 1}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_svm.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"pipe = Pipeline([('sv', StandardScaler()), ('clf', LogisticRegression())])\n",
"gs_lr = GridSearchCV(pipe,\n",
" param_grid=[{'clf__C': [0.01, 0.1, 1, 10, 100]}])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None, error_score='raise',\n",
" estimator=Pipeline(steps=[('sv', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
" penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
" verbose=0, warm_start=False))]),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid=[{'clf__C': [0.01, 0.1, 1, 10, 100]}],\n",
" pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_lr.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.58333333333333337"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_lr.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"y_s = np.array([1 if a >= 7 else 0 for a in y])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(X, y_s, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"gs = GridSearchCV(RandomForestClassifier(),\n",
" param_grid=[{'n_estimators': [3, 10, 20, 50],\n",
" 'max_depth': [3, 5, 10, 20, 30]}])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"GridSearchCV(cv=None, error_score='raise',\n",
" estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_split=1e-07, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid=[{'max_depth': [3, 5, 10, 20, 30], 'n_estimators': [3, 10, 20, 50]}],\n",
" pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.fit(X_s_train, y_s_train)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'max_depth': 10, 'n_estimators': 20}"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.90416666666666667"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"e = gs.best_estimator_\n",
"e.fit(X_s_train, y_s_train)\n",
"e.score(X_s_test, y_s_test)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.06236372, 0.11460061, 0.08839113, 0.06540987, 0.07850471,\n",
" 0.05851349, 0.07462395, 0.09557788, 0.0689122 , 0.12739927,\n",
" 0.16570318])"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"e.feature_importances_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment