Last active
August 24, 2016 19:04
-
-
Save ivaninkv/cf817b1f25d57a5006d4761113b5d231 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# импортируем библиотеки\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler, PolynomialFeatures\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"from sklearn.metrics import f1_score\n", | |
"from sklearn.cross_validation import train_test_split, cross_val_score\n", | |
"from sklearn.grid_search import GridSearchCV\n", | |
"import xgboost as xgb\n", | |
"import seaborn as sns\n", | |
"import matplotlib.pyplot as plt\n", | |
"%matplotlib inline\n", | |
"import pandas_profiling as pf\n", | |
"import warnings\n", | |
"warnings.filterwarnings('once') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\"\\nto_drop_cat = ['AHSCOL', 'AREORGN', 'AUNTYPE', 'PENATVTY', 'PRCITSHP', 'TAXINC']\\nto_drop_num = ['AHRSPAY', 'CAPGAIN', 'CAPLOSS']\\ntrain.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\\ntest.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\\n\"" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# грузим\n", | |
"train = pd.read_csv('train.csv')\n", | |
"train.ix[train.CLASS == '50000+', 'CLASS'] = 1\n", | |
"train.ix[train.CLASS == '-50000', 'CLASS'] = 0\n", | |
"\n", | |
"test = pd.read_csv('test.csv', index_col='Id')\n", | |
"\n", | |
"# дропаем столбцы AWKSTAT и WKSWORK (в AWKSTAT всего 1 значение и в трейне, и в тесте)\n", | |
"train.drop(['AWKSTAT', 'WKSWORK'], inplace=True, axis=1)\n", | |
"test.drop(['AWKSTAT', 'WKSWORK'], inplace=True, axis=1)\n", | |
"\n", | |
"train.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']] = train.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']].applymap(str)\n", | |
"test.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']] = test.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']].applymap(str)\n", | |
"\n", | |
"to_str = test.drop(train._get_numeric_data().columns, axis=1).columns\n", | |
"train.ix[:, to_str] = train.ix[:, to_str].applymap(str)\n", | |
"test.ix[:, to_str] = test.ix[:, to_str].applymap(str)\n", | |
"\n", | |
"'''\n", | |
"to_drop_cat = ['AHSCOL', 'AREORGN', 'AUNTYPE', 'PENATVTY', 'PRCITSHP', 'TAXINC']\n", | |
"to_drop_num = ['AHRSPAY', 'CAPGAIN', 'CAPLOSS']\n", | |
"train.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\n", | |
"test.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\n", | |
"'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"False\n", | |
"False\n" | |
] | |
} | |
], | |
"source": [ | |
"# проверяем, есть ли NaN в трейне и в тесте\n", | |
"print train.isnull().any().any()\n", | |
"print test.isnull().any().any()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# исследуем данные\n", | |
"def get_uniq(df, only_num = True): \n", | |
" if only_num == True:\n", | |
" df = df._get_numeric_data()\n", | |
" d = {}\n", | |
" for i in df:\n", | |
" d[i] = df[i].nunique() \n", | |
" res = pd.DataFrame(d.items()) \n", | |
" res.columns = ['fieldname', 'uniq_count']\n", | |
" res.index = res.ix[:, 'fieldname']\n", | |
" res.drop(['fieldname'], axis=1, inplace=True)\n", | |
" res['type'] = df.dtypes\n", | |
" return res.sort_values(by='uniq_count') \n", | |
"\n", | |
"def get_corr(df):\n", | |
" d = {}\n", | |
" df_num = df._get_numeric_data()\n", | |
" for i in df_num:\n", | |
" res = df_num.drop(i, axis=1).corrwith(df_num[i])\n", | |
" field_name = np.argmax(np.abs(res))\n", | |
" if np.max(np.abs(res)) == np.max(res):\n", | |
" corr_value = np.max(res)\n", | |
" else:\n", | |
" corr_value = np.min(res)\n", | |
" d[i + ' - ' + field_name] = corr_value\n", | |
" res = pd.DataFrame(d.items())\n", | |
" res.columns = ['pairs_name', 'corr_value']\n", | |
" res.index = res.ix[:, 'pairs_name']\n", | |
" res.drop(['pairs_name'], axis=1, inplace=True)\n", | |
" res['abs'] = res.corr_value.abs()\n", | |
" return res.sort_values(by='abs', ascending=False).drop(['abs'], axis=1)\n", | |
" \n", | |
"\n", | |
"#plt.figure(figsize=(15, 12))\n", | |
"#pf.ProfileReport(train._get_numeric_data())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 5000\n", | |
"0 5000\n", | |
"Name: CLASS, dtype: int64\n" | |
] | |
} | |
], | |
"source": [ | |
"# делаем dummies\n", | |
"train_row = train.shape[0]\n", | |
"\n", | |
"to_dum = pd.concat((train.drop(['CLASS'], axis=1), test))\n", | |
"dum = pd.get_dummies(to_dum)\n", | |
"X = dum[:10000]\n", | |
"X_for_pred = dum[10000:]\n", | |
"y = train.CLASS.astype(int)\n", | |
"print pd.value_counts(y) # классы сбалансированы" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(6700, 387)" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# делим выборку\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)\n", | |
"X_train.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# шкалируем, т.к. большой разброс\n", | |
"scalar = StandardScaler()\n", | |
"#scalar = MinMaxScaler(feature_range=(0,100))\n", | |
"X_train = scalar.fit_transform(X_train)\n", | |
"X_test = scalar.transform(X_test)\n", | |
"X_for_pred = scalar.transform(X_for_pred)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.874862758403\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0.86979627989371133" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# учимся и проверяем на кросс-валидации и на тесте\n", | |
"clf = GradientBoostingClassifier(random_state=42)\n", | |
"clf.fit(X_train, y_train)\n", | |
"print cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()\n", | |
"f1_score(y_test, clf.predict(X_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.86749851455733806" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# учимся и проверяем на кросс-валидации и на тесте\n", | |
"clf = xgb.XGBClassifier(n_estimators=300)\n", | |
"clf.fit(X_train, y_train)\n", | |
"print cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()\n", | |
"f1_score(y_test, clf.predict(X_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# предсказываем и пишем результат\n", | |
"res = pd.DataFrame(clf.predict(X_for_pred), test.index)\n", | |
"res.index.name = 'Id'\n", | |
"res.columns = ['Prediction']\n", | |
"res.to_csv('res.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# предсказываем и пишем результат (вероятности класса 1)\n", | |
"res = pd.DataFrame(clf.predict_proba(X_for_pred)[:,1], test.index)\n", | |
"res.index.name = 'Id'\n", | |
"res.columns = ['Prediction']\n", | |
"res.to_csv('res_proba.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Тюним модель\n", | |
"https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\Const\\Anaconda2\\envs\\py27\\lib\\site-packages\\sklearn\\externals\\joblib\\hashing.py:197: DeprecationWarning: Changing the shape of non-C contiguous array by\n", | |
"descriptor assignment is deprecated. To maintain\n", | |
"the Fortran contiguity of a multidimensional Fortran\n", | |
"array, use 'a.T.view(...).T' instead\n", | |
" obj_bytes_view = obj.view(self.np.uint8)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"([mean: 0.87232, std: 0.00848, params: {'n_estimators': 50},\n", | |
" mean: 0.87105, std: 0.00790, params: {'n_estimators': 100},\n", | |
" mean: 0.87163, std: 0.00872, params: {'n_estimators': 150},\n", | |
" mean: 0.87169, std: 0.00703, params: {'n_estimators': 200},\n", | |
" mean: 0.86967, std: 0.00687, params: {'n_estimators': 250},\n", | |
" mean: 0.86823, std: 0.00918, params: {'n_estimators': 300},\n", | |
" mean: 0.86686, std: 0.00787, params: {'n_estimators': 350},\n", | |
" mean: 0.86607, std: 0.00648, params: {'n_estimators': 400},\n", | |
" mean: 0.86589, std: 0.01034, params: {'n_estimators': 450},\n", | |
" mean: 0.86529, std: 0.00935, params: {'n_estimators': 500}],\n", | |
" {'n_estimators': 50},\n", | |
" 0.872316507636415)" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf1 = GradientBoostingClassifier(learning_rate=0.2, min_samples_split=100, min_samples_leaf=50, \n", | |
" max_depth=5, max_features='sqrt', \n", | |
" subsample=0.8, random_state=42)\n", | |
"param1 = {'n_estimators':range(50,501,50)}\n", | |
"gs1 = GridSearchCV(clf1, param1, scoring='f1', n_jobs=3, iid=False, cv=5)\n", | |
"gs1.fit(X_train, y_train)\n", | |
"gs1.grid_scores_, gs1.best_params_, gs1.best_score_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 5 folds for each of 90 candidates, totalling 450 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=3)]: Done 26 tasks | elapsed: 13.0s\n", | |
"[Parallel(n_jobs=3)]: Done 122 tasks | elapsed: 1.0min\n", | |
"[Parallel(n_jobs=3)]: Done 282 tasks | elapsed: 2.9min\n", | |
"[Parallel(n_jobs=3)]: Done 450 out of 450 | elapsed: 5.4min finished\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"([mean: 0.87226, std: 0.00734, params: {'min_samples_split': 50, 'max_depth': 5},\n", | |
" mean: 0.87368, std: 0.00580, params: {'min_samples_split': 100, 'max_depth': 5},\n", | |
" mean: 0.87072, std: 0.00566, params: {'min_samples_split': 150, 'max_depth': 5},\n", | |
" mean: 0.87209, std: 0.00407, params: {'min_samples_split': 200, 'max_depth': 5},\n", | |
" mean: 0.87026, std: 0.00879, params: {'min_samples_split': 250, 'max_depth': 5},\n", | |
" mean: 0.87306, std: 0.00609, params: {'min_samples_split': 300, 'max_depth': 5},\n", | |
" mean: 0.86901, std: 0.00901, params: {'min_samples_split': 350, 'max_depth': 5},\n", | |
" mean: 0.87221, std: 0.00587, params: {'min_samples_split': 400, 'max_depth': 5},\n", | |
" mean: 0.87196, std: 0.00754, params: {'min_samples_split': 450, 'max_depth': 5},\n", | |
" mean: 0.86957, std: 0.00469, params: {'min_samples_split': 500, 'max_depth': 5},\n", | |
" mean: 0.87184, std: 0.00781, params: {'min_samples_split': 50, 'max_depth': 7},\n", | |
" mean: 0.87500, std: 0.00871, params: {'min_samples_split': 100, 'max_depth': 7},\n", | |
" mean: 0.87773, std: 0.00920, params: {'min_samples_split': 150, 'max_depth': 7},\n", | |
" mean: 0.87468, std: 0.00915, params: {'min_samples_split': 200, 'max_depth': 7},\n", | |
" mean: 0.87093, std: 0.00967, params: {'min_samples_split': 250, 'max_depth': 7},\n", | |
" mean: 0.87382, std: 0.00943, params: {'min_samples_split': 300, 'max_depth': 7},\n", | |
" mean: 0.87206, std: 0.00821, params: {'min_samples_split': 350, 'max_depth': 7},\n", | |
" mean: 0.87265, std: 0.00843, params: {'min_samples_split': 400, 'max_depth': 7},\n", | |
" mean: 0.87103, std: 0.00778, params: {'min_samples_split': 450, 'max_depth': 7},\n", | |
" mean: 0.87411, std: 0.00695, params: {'min_samples_split': 500, 'max_depth': 7},\n", | |
" mean: 0.87294, std: 0.00620, params: {'min_samples_split': 50, 'max_depth': 9},\n", | |
" mean: 0.87172, std: 0.00965, params: {'min_samples_split': 100, 'max_depth': 9},\n", | |
" mean: 0.87392, std: 0.00779, params: {'min_samples_split': 150, 'max_depth': 9},\n", | |
" mean: 0.87603, std: 0.00697, params: {'min_samples_split': 200, 'max_depth': 9},\n", | |
" mean: 0.87248, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 9},\n", | |
" mean: 0.87061, std: 0.00494, params: {'min_samples_split': 300, 'max_depth': 9},\n", | |
" mean: 0.87223, std: 0.01166, params: {'min_samples_split': 350, 'max_depth': 9},\n", | |
" mean: 0.87197, std: 0.00861, params: {'min_samples_split': 400, 'max_depth': 9},\n", | |
" mean: 0.87084, std: 0.00732, params: {'min_samples_split': 450, 'max_depth': 9},\n", | |
" mean: 0.87213, std: 0.00518, params: {'min_samples_split': 500, 'max_depth': 9},\n", | |
" mean: 0.87114, std: 0.00551, params: {'min_samples_split': 50, 'max_depth': 11},\n", | |
" mean: 0.87342, std: 0.00711, params: {'min_samples_split': 100, 'max_depth': 11},\n", | |
" mean: 0.87427, std: 0.00781, params: {'min_samples_split': 150, 'max_depth': 11},\n", | |
" mean: 0.87222, std: 0.00679, params: {'min_samples_split': 200, 'max_depth': 11},\n", | |
" mean: 0.87096, std: 0.00623, params: {'min_samples_split': 250, 'max_depth': 11},\n", | |
" mean: 0.87649, std: 0.00822, params: {'min_samples_split': 300, 'max_depth': 11},\n", | |
" mean: 0.87411, std: 0.00776, params: {'min_samples_split': 350, 'max_depth': 11},\n", | |
" mean: 0.87288, std: 0.00573, params: {'min_samples_split': 400, 'max_depth': 11},\n", | |
" mean: 0.86911, std: 0.01002, params: {'min_samples_split': 450, 'max_depth': 11},\n", | |
" mean: 0.87098, std: 0.00633, params: {'min_samples_split': 500, 'max_depth': 11},\n", | |
" mean: 0.87247, std: 0.00517, params: {'min_samples_split': 50, 'max_depth': 13},\n", | |
" mean: 0.87125, std: 0.00547, params: {'min_samples_split': 100, 'max_depth': 13},\n", | |
" mean: 0.86967, std: 0.00484, params: {'min_samples_split': 150, 'max_depth': 13},\n", | |
" mean: 0.87564, std: 0.00742, params: {'min_samples_split': 200, 'max_depth': 13},\n", | |
" mean: 0.86971, std: 0.00852, params: {'min_samples_split': 250, 'max_depth': 13},\n", | |
" mean: 0.87143, std: 0.00819, params: {'min_samples_split': 300, 'max_depth': 13},\n", | |
" mean: 0.87219, std: 0.00453, params: {'min_samples_split': 350, 'max_depth': 13},\n", | |
" mean: 0.87362, std: 0.00738, params: {'min_samples_split': 400, 'max_depth': 13},\n", | |
" mean: 0.87268, std: 0.00503, params: {'min_samples_split': 450, 'max_depth': 13},\n", | |
" mean: 0.87286, std: 0.00585, params: {'min_samples_split': 500, 'max_depth': 13},\n", | |
" mean: 0.86697, std: 0.00798, params: {'min_samples_split': 50, 'max_depth': 15},\n", | |
" mean: 0.87423, std: 0.00724, params: {'min_samples_split': 100, 'max_depth': 15},\n", | |
" mean: 0.87178, std: 0.00672, params: {'min_samples_split': 150, 'max_depth': 15},\n", | |
" mean: 0.87504, std: 0.00727, params: {'min_samples_split': 200, 'max_depth': 15},\n", | |
" mean: 0.87325, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 15},\n", | |
" mean: 0.87210, std: 0.00896, params: {'min_samples_split': 300, 'max_depth': 15},\n", | |
" mean: 0.86752, std: 0.00579, params: {'min_samples_split': 350, 'max_depth': 15},\n", | |
" mean: 0.87153, std: 0.00583, params: {'min_samples_split': 400, 'max_depth': 15},\n", | |
" mean: 0.87179, std: 0.00558, params: {'min_samples_split': 450, 'max_depth': 15},\n", | |
" mean: 0.87197, std: 0.00743, params: {'min_samples_split': 500, 'max_depth': 15},\n", | |
" mean: 0.86829, std: 0.00864, params: {'min_samples_split': 50, 'max_depth': 17},\n", | |
" mean: 0.87172, std: 0.00516, params: {'min_samples_split': 100, 'max_depth': 17},\n", | |
" mean: 0.87261, std: 0.00700, params: {'min_samples_split': 150, 'max_depth': 17},\n", | |
" mean: 0.87288, std: 0.00679, params: {'min_samples_split': 200, 'max_depth': 17},\n", | |
" mean: 0.87350, std: 0.00718, params: {'min_samples_split': 250, 'max_depth': 17},\n", | |
" mean: 0.87207, std: 0.00426, params: {'min_samples_split': 300, 'max_depth': 17},\n", | |
" mean: 0.87097, std: 0.00523, params: {'min_samples_split': 350, 'max_depth': 17},\n", | |
" mean: 0.87190, std: 0.00442, params: {'min_samples_split': 400, 'max_depth': 17},\n", | |
" mean: 0.86916, std: 0.00397, params: {'min_samples_split': 450, 'max_depth': 17},\n", | |
" mean: 0.87263, std: 0.00953, params: {'min_samples_split': 500, 'max_depth': 17},\n", | |
" mean: 0.87257, std: 0.00778, params: {'min_samples_split': 50, 'max_depth': 19},\n", | |
" mean: 0.86710, std: 0.00723, params: {'min_samples_split': 100, 'max_depth': 19},\n", | |
" mean: 0.87169, std: 0.00659, params: {'min_samples_split': 150, 'max_depth': 19},\n", | |
" mean: 0.87165, std: 0.00527, params: {'min_samples_split': 200, 'max_depth': 19},\n", | |
" mean: 0.87070, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 19},\n", | |
" mean: 0.87010, std: 0.00776, params: {'min_samples_split': 300, 'max_depth': 19},\n", | |
" mean: 0.87046, std: 0.00889, params: {'min_samples_split': 350, 'max_depth': 19},\n", | |
" mean: 0.87589, std: 0.00640, params: {'min_samples_split': 400, 'max_depth': 19},\n", | |
" mean: 0.87534, std: 0.00682, params: {'min_samples_split': 450, 'max_depth': 19},\n", | |
" mean: 0.87113, std: 0.00759, params: {'min_samples_split': 500, 'max_depth': 19},\n", | |
" mean: 0.86669, std: 0.00739, params: {'min_samples_split': 50, 'max_depth': 21},\n", | |
" mean: 0.86406, std: 0.00713, params: {'min_samples_split': 100, 'max_depth': 21},\n", | |
" mean: 0.87070, std: 0.00878, params: {'min_samples_split': 150, 'max_depth': 21},\n", | |
" mean: 0.86683, std: 0.00823, params: {'min_samples_split': 200, 'max_depth': 21},\n", | |
" mean: 0.86897, std: 0.00759, params: {'min_samples_split': 250, 'max_depth': 21},\n", | |
" mean: 0.87230, std: 0.00864, params: {'min_samples_split': 300, 'max_depth': 21},\n", | |
" mean: 0.87100, std: 0.00875, params: {'min_samples_split': 350, 'max_depth': 21},\n", | |
" mean: 0.87194, std: 0.00491, params: {'min_samples_split': 400, 'max_depth': 21},\n", | |
" mean: 0.87384, std: 0.00674, params: {'min_samples_split': 450, 'max_depth': 21},\n", | |
" mean: 0.87367, std: 0.00521, params: {'min_samples_split': 500, 'max_depth': 21}],\n", | |
" {'max_depth': 7, 'min_samples_split': 150},\n", | |
" 0.87772979423822073)" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf2 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features='sqrt',\n", | |
" subsample=0.8, random_state=42)\n", | |
"param2 = {'max_depth':range(5,22,2), 'min_samples_split':range(50,501,50)}\n", | |
"gs2 = GridSearchCV(clf2, param2, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n", | |
"gs2.fit(X_train, y_train)\n", | |
"gs2.grid_scores_, gs2.best_params_, gs2.best_score_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 5 folds for each of 5 candidates, totalling 25 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=3)]: Done 25 out of 25 | elapsed: 33.2s finished\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"([mean: 0.86712, std: 0.00709, params: {'max_depth': 22},\n", | |
" mean: 0.87124, std: 0.00496, params: {'max_depth': 24},\n", | |
" mean: 0.86805, std: 0.00541, params: {'max_depth': 26},\n", | |
" mean: 0.86783, std: 0.00566, params: {'max_depth': 28},\n", | |
" mean: 0.86946, std: 0.00885, params: {'max_depth': 30}],\n", | |
" {'max_depth': 24},\n", | |
" 0.87123804937572125)" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf3 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features='sqrt',\n", | |
" subsample=0.8, min_samples_split = 150,\n", | |
" random_state=42)\n", | |
"param3 = {'max_depth':range(22,32,2)}\n", | |
"gs3 = GridSearchCV(clf3, param3, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n", | |
"gs3.fit(X_train, y_train)\n", | |
"gs3.grid_scores_, gs3.best_params_, gs3.best_score_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 5 folds for each of 14 candidates, totalling 70 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=3)]: Done 26 tasks | elapsed: 24.1s\n", | |
"[Parallel(n_jobs=3)]: Done 70 out of 70 | elapsed: 1.7min finished\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"([mean: 0.87324, std: 0.00970, params: {'max_features': 20},\n", | |
" mean: 0.87138, std: 0.00937, params: {'max_features': 30},\n", | |
" mean: 0.87300, std: 0.00846, params: {'max_features': 40},\n", | |
" mean: 0.87436, std: 0.00911, params: {'max_features': 50},\n", | |
" mean: 0.87333, std: 0.00587, params: {'max_features': 60},\n", | |
" mean: 0.87347, std: 0.00802, params: {'max_features': 70},\n", | |
" mean: 0.87266, std: 0.00968, params: {'max_features': 80},\n", | |
" mean: 0.87344, std: 0.00663, params: {'max_features': 90},\n", | |
" mean: 0.87243, std: 0.00839, params: {'max_features': 100},\n", | |
" mean: 0.87473, std: 0.00959, params: {'max_features': 110},\n", | |
" mean: 0.87240, std: 0.01106, params: {'max_features': 120},\n", | |
" mean: 0.87260, std: 0.00897, params: {'max_features': 130},\n", | |
" mean: 0.87133, std: 0.00653, params: {'max_features': 140},\n", | |
" mean: 0.87388, std: 0.00910, params: {'max_features': 150}],\n", | |
" {'max_features': 110},\n", | |
" 0.87472979884016444)" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf4 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, subsample=0.8, \n", | |
" min_samples_split = 150,\n", | |
" max_depth=7, random_state=42)\n", | |
"param4 = {'max_features':range(20,151,10)}\n", | |
"gs4 = GridSearchCV(clf4, param4, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n", | |
"gs4.fit(X_train, y_train)\n", | |
"gs4.grid_scores_, gs4.best_params_, gs4.best_score_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 5 folds for each of 5 candidates, totalling 25 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=3)]: Done 25 out of 25 | elapsed: 1.8min finished\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"([mean: 0.87026, std: 0.00831, params: {'subsample': 0.7},\n", | |
" mean: 0.86900, std: 0.00819, params: {'subsample': 0.75},\n", | |
" mean: 0.87097, std: 0.00872, params: {'subsample': 0.8},\n", | |
" mean: 0.87134, std: 0.00950, params: {'subsample': 0.85},\n", | |
" mean: 0.86609, std: 0.01153, params: {'subsample': 0.9}],\n", | |
" {'subsample': 0.85},\n", | |
" 0.87134009606077412)" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf5 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features=110,\n", | |
" min_samples_split = 150,\n", | |
" max_depth=21, random_state=42)\n", | |
"param5 = {'subsample':[0.7,0.75,0.8,0.85,0.9]}\n", | |
"gs5 = GridSearchCV(clf5, param5, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n", | |
"gs5.fit(X_train, y_train)\n", | |
"gs5.grid_scores_, gs5.best_params_, gs5.best_score_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 5min 1s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0.86910062333036509" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf_tun1 = GradientBoostingClassifier(learning_rate=0.2/128, n_estimators=50*128,\n", | |
" #max_features=110, \n", | |
" max_features='sqrt', \n", | |
" min_samples_split = 150, subsample=0.85,\n", | |
" max_depth=21, random_state=42, warm_start=True)\n", | |
"%time clf_tun1.fit(X_train, y_train)\n", | |
"#print cross_val_score(clf_tun1, X_train, y_train, n_jobs=3, cv=5, scoring='f1', verbose=3).mean()\n", | |
"f1_score(y_test, clf_tun1.predict(X_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# предсказываем и пишем результат\n", | |
"res = pd.DataFrame(clf_tun1.predict(X_for_pred), test.index)\n", | |
"res.index.name = 'Id'\n", | |
"res.columns = ['Prediction']\n", | |
"res.to_csv('res.csv')\n", | |
"\n", | |
"# предсказываем и пишем результат вероятности\n", | |
"res = pd.DataFrame(clf_tun1.predict_proba(X_for_pred)[:,1], test.index)\n", | |
"res.index.name = 'Id'\n", | |
"res.columns = ['Prediction']\n", | |
"res.to_csv('res_proba.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [py27]", | |
"language": "python", | |
"name": "Python [py27]" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment