Skip to content

Instantly share code, notes, and snippets.

@ivaninkv
Last active August 24, 2016 19:04
Show Gist options
  • Save ivaninkv/cf817b1f25d57a5006d4761113b5d231 to your computer and use it in GitHub Desktop.
Save ivaninkv/cf817b1f25d57a5006d4761113b5d231 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# импортируем библиотеки\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler, PolynomialFeatures\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.cross_validation import train_test_split, cross_val_score\n",
"from sklearn.grid_search import GridSearchCV\n",
"import xgboost as xgb\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import pandas_profiling as pf\n",
"import warnings\n",
"warnings.filterwarnings('once') "
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nto_drop_cat = ['AHSCOL', 'AREORGN', 'AUNTYPE', 'PENATVTY', 'PRCITSHP', 'TAXINC']\\nto_drop_num = ['AHRSPAY', 'CAPGAIN', 'CAPLOSS']\\ntrain.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\\ntest.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\\n\""
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# грузим\n",
"train = pd.read_csv('train.csv')\n",
"train.ix[train.CLASS == '50000+', 'CLASS'] = 1\n",
"train.ix[train.CLASS == '-50000', 'CLASS'] = 0\n",
"\n",
"test = pd.read_csv('test.csv', index_col='Id')\n",
"\n",
"# дропаем столбец AWKSTAT в нем 1 значение в трейне и в тесте\n",
"train.drop(['AWKSTAT', 'WKSWORK'], inplace=True, axis=1)\n",
"test.drop(['AWKSTAT', 'WKSWORK'], inplace=True, axis=1)\n",
"\n",
"train.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']] = train.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']].applymap(str)\n",
"test.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']] = test.ix[:, ['VETQVA', 'SEOTR', 'NOEMP']].applymap(str)\n",
"\n",
"to_str = test.drop(train._get_numeric_data().columns, axis=1).columns\n",
"train.ix[:, to_str] = train.ix[:, to_str].applymap(str)\n",
"test.ix[:, to_str] = test.ix[:, to_str].applymap(str)\n",
"\n",
"'''\n",
"to_drop_cat = ['AHSCOL', 'AREORGN', 'AUNTYPE', 'PENATVTY', 'PRCITSHP', 'TAXINC']\n",
"to_drop_num = ['AHRSPAY', 'CAPGAIN', 'CAPLOSS']\n",
"train.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\n",
"test.drop(to_drop_cat + to_drop_num, inplace=True, axis=1)\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n"
]
}
],
"source": [
"# NaN\n",
"print train.isnull().any().any()\n",
"print test.isnull().any().any()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# исследуем данные\n",
"def get_uniq(df, only_num = True): \n",
" if only_num == True:\n",
" df = df._get_numeric_data()\n",
" d = {}\n",
" for i in df:\n",
" d[i] = df[i].nunique() \n",
" res = pd.DataFrame(d.items()) \n",
" res.columns = ['fieldname', 'uniq_count']\n",
" res.index = res.ix[:, 'fieldname']\n",
" res.drop(['fieldname'], axis=1, inplace=True)\n",
" res['type'] = df.dtypes\n",
" return res.sort_values(by='uniq_count') \n",
"\n",
"def get_corr(df):\n",
" d = {}\n",
" df_num = df._get_numeric_data()\n",
" for i in df_num:\n",
" res = df_num.drop(i, axis=1).corrwith(df_num[i])\n",
" field_name = np.argmax(np.abs(res))\n",
" if np.max(np.abs(res)) == np.max(res):\n",
" corr_value = np.max(res)\n",
" else:\n",
" corr_value = np.min(res)\n",
" d[i + ' - ' + field_name] = corr_value\n",
" res = pd.DataFrame(d.items())\n",
" res.columns = ['pairs_name', 'corr_value']\n",
" res.index = res.ix[:, 'pairs_name']\n",
" res.drop(['pairs_name'], axis=1, inplace=True)\n",
" res['abs'] = res.corr_value.abs()\n",
" return res.sort_values(by='abs', ascending=False).drop(['abs'], axis=1)\n",
" \n",
"\n",
"#plt.figure(figsize=(15, 12))\n",
"#pf.ProfileReport(train._get_numeric_data())"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 5000\n",
"0 5000\n",
"Name: CLASS, dtype: int64\n"
]
}
],
"source": [
"# делаем dummies\n",
"train_row = train.shape[0]\n",
"\n",
"to_dum = pd.concat((train.drop(['CLASS'], axis=1), test))\n",
"dum = pd.get_dummies(to_dum)\n",
"X = dum[:10000]\n",
"X_for_pred = dum[10000:]\n",
"y = train.CLASS.astype(int)\n",
"print pd.value_counts(y) # классы сбалансированы"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(6700, 387)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# делим выборку\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)\n",
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# шкалируем, т.к. большой разброс\n",
"scalar = StandardScaler()\n",
"#scalar = MinMaxScaler(feature_range=(0,100))\n",
"X_train = scalar.fit_transform(X_train)\n",
"X_test = scalar.transform(X_test)\n",
"X_for_pred = scalar.transform(X_for_pred)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.874862758403\n"
]
},
{
"data": {
"text/plain": [
"0.86979627989371133"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# учимся и проверяем на КВ и тесте\n",
"clf = GradientBoostingClassifier(random_state=42)\n",
"clf.fit(X_train, y_train)\n",
"print cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()\n",
"f1_score(y_test, clf.predict(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.86749851455733806"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# учимся и проверяем на КВ и тесте\n",
"clf = xgb.XGBClassifier(n_estimators=300)\n",
"clf.fit(X_train, y_train)\n",
"print cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()\n",
"f1_score(y_test, clf.predict(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# предсказываем пишем результат\n",
"res = pd.DataFrame(clf.predict(X_for_pred), test.index)\n",
"res.index.name = 'Id'\n",
"res.columns = ['Prediction']\n",
"res.to_csv('res.csv')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# предсказываем пишем результат вероятности\n",
"res = pd.DataFrame(clf.predict_proba(X_for_pred)[:,1], test.index)\n",
"res.index.name = 'Id'\n",
"res.columns = ['Prediction']\n",
"res.to_csv('res_proba.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Тюним модель\n",
"https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Const\\Anaconda2\\envs\\py27\\lib\\site-packages\\sklearn\\externals\\joblib\\hashing.py:197: DeprecationWarning: Changing the shape of non-C contiguous array by\n",
"descriptor assignment is deprecated. To maintain\n",
"the Fortran contiguity of a multidimensional Fortran\n",
"array, use 'a.T.view(...).T' instead\n",
" obj_bytes_view = obj.view(self.np.uint8)\n"
]
},
{
"data": {
"text/plain": [
"([mean: 0.87232, std: 0.00848, params: {'n_estimators': 50},\n",
" mean: 0.87105, std: 0.00790, params: {'n_estimators': 100},\n",
" mean: 0.87163, std: 0.00872, params: {'n_estimators': 150},\n",
" mean: 0.87169, std: 0.00703, params: {'n_estimators': 200},\n",
" mean: 0.86967, std: 0.00687, params: {'n_estimators': 250},\n",
" mean: 0.86823, std: 0.00918, params: {'n_estimators': 300},\n",
" mean: 0.86686, std: 0.00787, params: {'n_estimators': 350},\n",
" mean: 0.86607, std: 0.00648, params: {'n_estimators': 400},\n",
" mean: 0.86589, std: 0.01034, params: {'n_estimators': 450},\n",
" mean: 0.86529, std: 0.00935, params: {'n_estimators': 500}],\n",
" {'n_estimators': 50},\n",
" 0.872316507636415)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf1 = GradientBoostingClassifier(learning_rate=0.2, min_samples_split=100, min_samples_leaf=50, \n",
" max_depth=5, max_features='sqrt', \n",
" subsample=0.8, random_state=42)\n",
"param1 = {'n_estimators':range(50,501,50)}\n",
"gs1 = GridSearchCV(clf1, param1, scoring='f1', n_jobs=3, iid=False, cv=5)\n",
"gs1.fit(X_train, y_train)\n",
"gs1.grid_scores_, gs1.best_params_, gs1.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 90 candidates, totalling 450 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=3)]: Done 26 tasks | elapsed: 13.0s\n",
"[Parallel(n_jobs=3)]: Done 122 tasks | elapsed: 1.0min\n",
"[Parallel(n_jobs=3)]: Done 282 tasks | elapsed: 2.9min\n",
"[Parallel(n_jobs=3)]: Done 450 out of 450 | elapsed: 5.4min finished\n"
]
},
{
"data": {
"text/plain": [
"([mean: 0.87226, std: 0.00734, params: {'min_samples_split': 50, 'max_depth': 5},\n",
" mean: 0.87368, std: 0.00580, params: {'min_samples_split': 100, 'max_depth': 5},\n",
" mean: 0.87072, std: 0.00566, params: {'min_samples_split': 150, 'max_depth': 5},\n",
" mean: 0.87209, std: 0.00407, params: {'min_samples_split': 200, 'max_depth': 5},\n",
" mean: 0.87026, std: 0.00879, params: {'min_samples_split': 250, 'max_depth': 5},\n",
" mean: 0.87306, std: 0.00609, params: {'min_samples_split': 300, 'max_depth': 5},\n",
" mean: 0.86901, std: 0.00901, params: {'min_samples_split': 350, 'max_depth': 5},\n",
" mean: 0.87221, std: 0.00587, params: {'min_samples_split': 400, 'max_depth': 5},\n",
" mean: 0.87196, std: 0.00754, params: {'min_samples_split': 450, 'max_depth': 5},\n",
" mean: 0.86957, std: 0.00469, params: {'min_samples_split': 500, 'max_depth': 5},\n",
" mean: 0.87184, std: 0.00781, params: {'min_samples_split': 50, 'max_depth': 7},\n",
" mean: 0.87500, std: 0.00871, params: {'min_samples_split': 100, 'max_depth': 7},\n",
" mean: 0.87773, std: 0.00920, params: {'min_samples_split': 150, 'max_depth': 7},\n",
" mean: 0.87468, std: 0.00915, params: {'min_samples_split': 200, 'max_depth': 7},\n",
" mean: 0.87093, std: 0.00967, params: {'min_samples_split': 250, 'max_depth': 7},\n",
" mean: 0.87382, std: 0.00943, params: {'min_samples_split': 300, 'max_depth': 7},\n",
" mean: 0.87206, std: 0.00821, params: {'min_samples_split': 350, 'max_depth': 7},\n",
" mean: 0.87265, std: 0.00843, params: {'min_samples_split': 400, 'max_depth': 7},\n",
" mean: 0.87103, std: 0.00778, params: {'min_samples_split': 450, 'max_depth': 7},\n",
" mean: 0.87411, std: 0.00695, params: {'min_samples_split': 500, 'max_depth': 7},\n",
" mean: 0.87294, std: 0.00620, params: {'min_samples_split': 50, 'max_depth': 9},\n",
" mean: 0.87172, std: 0.00965, params: {'min_samples_split': 100, 'max_depth': 9},\n",
" mean: 0.87392, std: 0.00779, params: {'min_samples_split': 150, 'max_depth': 9},\n",
" mean: 0.87603, std: 0.00697, params: {'min_samples_split': 200, 'max_depth': 9},\n",
" mean: 0.87248, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 9},\n",
" mean: 0.87061, std: 0.00494, params: {'min_samples_split': 300, 'max_depth': 9},\n",
" mean: 0.87223, std: 0.01166, params: {'min_samples_split': 350, 'max_depth': 9},\n",
" mean: 0.87197, std: 0.00861, params: {'min_samples_split': 400, 'max_depth': 9},\n",
" mean: 0.87084, std: 0.00732, params: {'min_samples_split': 450, 'max_depth': 9},\n",
" mean: 0.87213, std: 0.00518, params: {'min_samples_split': 500, 'max_depth': 9},\n",
" mean: 0.87114, std: 0.00551, params: {'min_samples_split': 50, 'max_depth': 11},\n",
" mean: 0.87342, std: 0.00711, params: {'min_samples_split': 100, 'max_depth': 11},\n",
" mean: 0.87427, std: 0.00781, params: {'min_samples_split': 150, 'max_depth': 11},\n",
" mean: 0.87222, std: 0.00679, params: {'min_samples_split': 200, 'max_depth': 11},\n",
" mean: 0.87096, std: 0.00623, params: {'min_samples_split': 250, 'max_depth': 11},\n",
" mean: 0.87649, std: 0.00822, params: {'min_samples_split': 300, 'max_depth': 11},\n",
" mean: 0.87411, std: 0.00776, params: {'min_samples_split': 350, 'max_depth': 11},\n",
" mean: 0.87288, std: 0.00573, params: {'min_samples_split': 400, 'max_depth': 11},\n",
" mean: 0.86911, std: 0.01002, params: {'min_samples_split': 450, 'max_depth': 11},\n",
" mean: 0.87098, std: 0.00633, params: {'min_samples_split': 500, 'max_depth': 11},\n",
" mean: 0.87247, std: 0.00517, params: {'min_samples_split': 50, 'max_depth': 13},\n",
" mean: 0.87125, std: 0.00547, params: {'min_samples_split': 100, 'max_depth': 13},\n",
" mean: 0.86967, std: 0.00484, params: {'min_samples_split': 150, 'max_depth': 13},\n",
" mean: 0.87564, std: 0.00742, params: {'min_samples_split': 200, 'max_depth': 13},\n",
" mean: 0.86971, std: 0.00852, params: {'min_samples_split': 250, 'max_depth': 13},\n",
" mean: 0.87143, std: 0.00819, params: {'min_samples_split': 300, 'max_depth': 13},\n",
" mean: 0.87219, std: 0.00453, params: {'min_samples_split': 350, 'max_depth': 13},\n",
" mean: 0.87362, std: 0.00738, params: {'min_samples_split': 400, 'max_depth': 13},\n",
" mean: 0.87268, std: 0.00503, params: {'min_samples_split': 450, 'max_depth': 13},\n",
" mean: 0.87286, std: 0.00585, params: {'min_samples_split': 500, 'max_depth': 13},\n",
" mean: 0.86697, std: 0.00798, params: {'min_samples_split': 50, 'max_depth': 15},\n",
" mean: 0.87423, std: 0.00724, params: {'min_samples_split': 100, 'max_depth': 15},\n",
" mean: 0.87178, std: 0.00672, params: {'min_samples_split': 150, 'max_depth': 15},\n",
" mean: 0.87504, std: 0.00727, params: {'min_samples_split': 200, 'max_depth': 15},\n",
" mean: 0.87325, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 15},\n",
" mean: 0.87210, std: 0.00896, params: {'min_samples_split': 300, 'max_depth': 15},\n",
" mean: 0.86752, std: 0.00579, params: {'min_samples_split': 350, 'max_depth': 15},\n",
" mean: 0.87153, std: 0.00583, params: {'min_samples_split': 400, 'max_depth': 15},\n",
" mean: 0.87179, std: 0.00558, params: {'min_samples_split': 450, 'max_depth': 15},\n",
" mean: 0.87197, std: 0.00743, params: {'min_samples_split': 500, 'max_depth': 15},\n",
" mean: 0.86829, std: 0.00864, params: {'min_samples_split': 50, 'max_depth': 17},\n",
" mean: 0.87172, std: 0.00516, params: {'min_samples_split': 100, 'max_depth': 17},\n",
" mean: 0.87261, std: 0.00700, params: {'min_samples_split': 150, 'max_depth': 17},\n",
" mean: 0.87288, std: 0.00679, params: {'min_samples_split': 200, 'max_depth': 17},\n",
" mean: 0.87350, std: 0.00718, params: {'min_samples_split': 250, 'max_depth': 17},\n",
" mean: 0.87207, std: 0.00426, params: {'min_samples_split': 300, 'max_depth': 17},\n",
" mean: 0.87097, std: 0.00523, params: {'min_samples_split': 350, 'max_depth': 17},\n",
" mean: 0.87190, std: 0.00442, params: {'min_samples_split': 400, 'max_depth': 17},\n",
" mean: 0.86916, std: 0.00397, params: {'min_samples_split': 450, 'max_depth': 17},\n",
" mean: 0.87263, std: 0.00953, params: {'min_samples_split': 500, 'max_depth': 17},\n",
" mean: 0.87257, std: 0.00778, params: {'min_samples_split': 50, 'max_depth': 19},\n",
" mean: 0.86710, std: 0.00723, params: {'min_samples_split': 100, 'max_depth': 19},\n",
" mean: 0.87169, std: 0.00659, params: {'min_samples_split': 150, 'max_depth': 19},\n",
" mean: 0.87165, std: 0.00527, params: {'min_samples_split': 200, 'max_depth': 19},\n",
" mean: 0.87070, std: 0.00739, params: {'min_samples_split': 250, 'max_depth': 19},\n",
" mean: 0.87010, std: 0.00776, params: {'min_samples_split': 300, 'max_depth': 19},\n",
" mean: 0.87046, std: 0.00889, params: {'min_samples_split': 350, 'max_depth': 19},\n",
" mean: 0.87589, std: 0.00640, params: {'min_samples_split': 400, 'max_depth': 19},\n",
" mean: 0.87534, std: 0.00682, params: {'min_samples_split': 450, 'max_depth': 19},\n",
" mean: 0.87113, std: 0.00759, params: {'min_samples_split': 500, 'max_depth': 19},\n",
" mean: 0.86669, std: 0.00739, params: {'min_samples_split': 50, 'max_depth': 21},\n",
" mean: 0.86406, std: 0.00713, params: {'min_samples_split': 100, 'max_depth': 21},\n",
" mean: 0.87070, std: 0.00878, params: {'min_samples_split': 150, 'max_depth': 21},\n",
" mean: 0.86683, std: 0.00823, params: {'min_samples_split': 200, 'max_depth': 21},\n",
" mean: 0.86897, std: 0.00759, params: {'min_samples_split': 250, 'max_depth': 21},\n",
" mean: 0.87230, std: 0.00864, params: {'min_samples_split': 300, 'max_depth': 21},\n",
" mean: 0.87100, std: 0.00875, params: {'min_samples_split': 350, 'max_depth': 21},\n",
" mean: 0.87194, std: 0.00491, params: {'min_samples_split': 400, 'max_depth': 21},\n",
" mean: 0.87384, std: 0.00674, params: {'min_samples_split': 450, 'max_depth': 21},\n",
" mean: 0.87367, std: 0.00521, params: {'min_samples_split': 500, 'max_depth': 21}],\n",
" {'max_depth': 7, 'min_samples_split': 150},\n",
" 0.87772979423822073)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf2 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features='sqrt',\n",
" subsample=0.8, random_state=42)\n",
"param2 = {'max_depth':range(5,22,2), 'min_samples_split':range(50,501,50)}\n",
"gs2 = GridSearchCV(clf2, param2, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n",
"gs2.fit(X_train, y_train)\n",
"gs2.grid_scores_, gs2.best_params_, gs2.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 5 candidates, totalling 25 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=3)]: Done 25 out of 25 | elapsed: 33.2s finished\n"
]
},
{
"data": {
"text/plain": [
"([mean: 0.86712, std: 0.00709, params: {'max_depth': 22},\n",
" mean: 0.87124, std: 0.00496, params: {'max_depth': 24},\n",
" mean: 0.86805, std: 0.00541, params: {'max_depth': 26},\n",
" mean: 0.86783, std: 0.00566, params: {'max_depth': 28},\n",
" mean: 0.86946, std: 0.00885, params: {'max_depth': 30}],\n",
" {'max_depth': 24},\n",
" 0.87123804937572125)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf3 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features='sqrt',\n",
" subsample=0.8, min_samples_split = 150,\n",
" random_state=42)\n",
"param3 = {'max_depth':range(22,32,2)}\n",
"gs3 = GridSearchCV(clf3, param3, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n",
"gs3.fit(X_train, y_train)\n",
"gs3.grid_scores_, gs3.best_params_, gs3.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 14 candidates, totalling 70 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=3)]: Done 26 tasks | elapsed: 24.1s\n",
"[Parallel(n_jobs=3)]: Done 70 out of 70 | elapsed: 1.7min finished\n"
]
},
{
"data": {
"text/plain": [
"([mean: 0.87324, std: 0.00970, params: {'max_features': 20},\n",
" mean: 0.87138, std: 0.00937, params: {'max_features': 30},\n",
" mean: 0.87300, std: 0.00846, params: {'max_features': 40},\n",
" mean: 0.87436, std: 0.00911, params: {'max_features': 50},\n",
" mean: 0.87333, std: 0.00587, params: {'max_features': 60},\n",
" mean: 0.87347, std: 0.00802, params: {'max_features': 70},\n",
" mean: 0.87266, std: 0.00968, params: {'max_features': 80},\n",
" mean: 0.87344, std: 0.00663, params: {'max_features': 90},\n",
" mean: 0.87243, std: 0.00839, params: {'max_features': 100},\n",
" mean: 0.87473, std: 0.00959, params: {'max_features': 110},\n",
" mean: 0.87240, std: 0.01106, params: {'max_features': 120},\n",
" mean: 0.87260, std: 0.00897, params: {'max_features': 130},\n",
" mean: 0.87133, std: 0.00653, params: {'max_features': 140},\n",
" mean: 0.87388, std: 0.00910, params: {'max_features': 150}],\n",
" {'max_features': 110},\n",
" 0.87472979884016444)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf4 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, subsample=0.8, \n",
" min_samples_split = 150,\n",
" max_depth=7, random_state=42)\n",
"param4 = {'max_features':range(20,151,10)}\n",
"gs4 = GridSearchCV(clf4, param4, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n",
"gs4.fit(X_train, y_train)\n",
"gs4.grid_scores_, gs4.best_params_, gs4.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 5 candidates, totalling 25 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=3)]: Done 25 out of 25 | elapsed: 1.8min finished\n"
]
},
{
"data": {
"text/plain": [
"([mean: 0.87026, std: 0.00831, params: {'subsample': 0.7},\n",
" mean: 0.86900, std: 0.00819, params: {'subsample': 0.75},\n",
" mean: 0.87097, std: 0.00872, params: {'subsample': 0.8},\n",
" mean: 0.87134, std: 0.00950, params: {'subsample': 0.85},\n",
" mean: 0.86609, std: 0.01153, params: {'subsample': 0.9}],\n",
" {'subsample': 0.85},\n",
" 0.87134009606077412)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf5 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, max_features=110,\n",
" min_samples_split = 150,\n",
" max_depth=21, random_state=42)\n",
"param5 = {'subsample':[0.7,0.75,0.8,0.85,0.9]}\n",
"gs5 = GridSearchCV(clf5, param5, scoring='f1', n_jobs=3, iid=False, cv=5, verbose=3)\n",
"gs5.fit(X_train, y_train)\n",
"gs5.grid_scores_, gs5.best_params_, gs5.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 5min 1s\n"
]
},
{
"data": {
"text/plain": [
"0.86910062333036509"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf_tun1 = GradientBoostingClassifier(learning_rate=0.2/128, n_estimators=50*128,\n",
" #max_features=110, \n",
" max_features='sqrt', \n",
" min_samples_split = 150, subsample=0.85,\n",
" max_depth=21, random_state=42, warm_start=True)\n",
"%time clf_tun1.fit(X_train, y_train)\n",
"#print cross_val_score(clf_tun1, X_train, y_train, n_jobs=3, cv=5, scoring='f1', verbose=3).mean()\n",
"f1_score(y_test, clf_tun1.predict(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# предсказываем и пишем результат\n",
"res = pd.DataFrame(clf_tun1.predict(X_for_pred), test.index)\n",
"res.index.name = 'Id'\n",
"res.columns = ['Prediction']\n",
"res.to_csv('res.csv')\n",
"\n",
"# предсказываем и пишем результат вероятности\n",
"res = pd.DataFrame(clf_tun1.predict_proba(X_for_pred)[:,1], test.index)\n",
"res.index.name = 'Id'\n",
"res.columns = ['Prediction']\n",
"res.to_csv('res_proba.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [py27]",
"language": "python",
"name": "Python [py27]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment