Created
December 15, 2018 12:24
-
-
Save ikedaosushi/a30e28c9abe6e353ba3866fac43b2708 to your computer and use it in GitHub Desktop.
Stacking/Blendingをheamyで、Stacknetをpystacknetで高速に実装する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import RandomForestRegressor\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"from sklearn.neighbors import KNeighborsRegressor\n", | |
"from sklearn.neighbors import KNeighborsRegressor\n", | |
"from sklearn.model_selection import cross_val_score\n", | |
"from sklearn.datasets import load_boston\n", | |
"from sklearn.model_selection import KFold\n", | |
"from sklearn.model_selection import cross_val_score\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.metrics import mean_absolute_error\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"from sklearn.metrics import mean_absolute_error\n", | |
"from catboost import CatBoostRegressor\n", | |
"from sklearn.ensemble import ExtraTreesRegressor\n", | |
"from sklearn.linear_model import Ridge\n", | |
"from xgboost import XGBRegressor\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"seed = 60" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Dataload" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X, y = load_boston(return_X_y=True)\n", | |
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=seed)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Baseline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"models_dic = {\n", | |
" 'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),\n", | |
" 'linear regression': LinearRegression(normalize=True),\n", | |
" 'knn': KNeighborsRegressor(),\n", | |
" 'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')\n", | |
"}\n", | |
"\n", | |
"for name, model in models_dic.items():\n", | |
" kfold = KFold(n_splits=10, random_state=seed)\n", | |
" cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=\"neg_mean_absolute_error\")\n", | |
" print(f'{name} : {-np.mean(cv_results):.2f}')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# heamy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from heamy.dataset import Dataset\n", | |
"from heamy.estimator import Regressor, Classifier\n", | |
"from heamy.pipeline import ModelsPipeline" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### stacking" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"# datasetを準備\n", | |
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n", | |
"\n", | |
"# アンサンブルに使うモデルを定義\n", | |
"models = [\n", | |
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n", | |
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n", | |
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n", | |
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n", | |
"]\n", | |
"\n", | |
"# pipelineを定義、2nd levelデータセットの作成\n", | |
"pipeline = ModelsPipeline(*models)\n", | |
"stack_ds = pipeline.stack(k=10, seed=seed)\n", | |
"\n", | |
"# modelを作ってvalidation\n", | |
"stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)\n", | |
"y_trues, y_preds = stacker.validate(k=10)\n", | |
"\n", | |
"# 精度出力\n", | |
"cv_results = []\n", | |
"for y_true, y_pred in zip(y_trues, y_preds):\n", | |
" cv_result = mean_absolute_error(y_true, y_pred)\n", | |
" cv_results.append(cv_result)\n", | |
"print(f'stacking: {np.mean(cv_results):.2f}')\n", | |
"\n", | |
"# X_testを使ってpredict\n", | |
"y_pred = stacker.predict()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Blending" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"# datasetを準備\n", | |
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n", | |
"\n", | |
"# アンサンブルに使うモデルを定義\n", | |
"models = [\n", | |
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n", | |
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n", | |
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n", | |
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n", | |
"]\n", | |
"\n", | |
"# pipelineを定義、2nd levelデータセットの作成\n", | |
"pipeline = ModelsPipeline(*models)\n", | |
"stack_ds = pipeline.blend(proportion=0.2, seed=seed)\n", | |
"\n", | |
"# modelを作ってvalidation\n", | |
"stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)\n", | |
"y_trues, y_preds = stacker.validate(k=10)\n", | |
"\n", | |
"# 精度出力\n", | |
"cv_results = []\n", | |
"for y_true, y_pred in zip(y_trues, y_preds):\n", | |
" cv_result = mean_absolute_error(y_true, y_pred)\n", | |
" cv_results.append(cv_result)\n", | |
"print(f'blending: {np.mean(cv_results):.2f}')\n", | |
"\n", | |
"# X_testを使ってpredict\n", | |
"y_pred = stacker.predict()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Weighted Average" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"# datasetを準備\n", | |
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n", | |
"\n", | |
"# アンサンブルに使うモデルを定義\n", | |
"models = [\n", | |
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n", | |
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n", | |
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n", | |
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n", | |
"]\n", | |
"\n", | |
"# pipelineを定義\n", | |
"pipeline = ModelsPipeline(*models)\n", | |
"\n", | |
"# 最適な重みを探索\n", | |
"weights = pipeline.find_weights(mean_absolute_error)\n", | |
"pipeline_apply = pipeline.weight(weights)\n", | |
"\n", | |
"# 精度を出力\n", | |
"cv_results = pipeline_apply.validate(scorer=mean_absolute_error, k=10)\n", | |
"print(f'weighted average: {np.mean(cv_results):.2f}')\n", | |
"\n", | |
"# X_testを使ってpredict\n", | |
"y_pred = pipeline_apply.execute()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# pystacknet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"from pystacknet.pystacknet import StackNetRegressor\n", | |
"\n", | |
"# アンサンブルに使うモデルを定義\n", | |
"models=[ \n", | |
" # 1st level\n", | |
" [\n", | |
" RandomForestRegressor(n_estimators=50, random_state=seed),\n", | |
" LinearRegression(normalize=True),\n", | |
" CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent') \n", | |
" ],\n", | |
" # 2nd level\n", | |
" [\n", | |
" Ridge(normalize=True),\n", | |
" ExtraTreesRegressor(random_state=seed),\n", | |
" XGBRegressor(random_state=seed)\n", | |
" ],\n", | |
" [\n", | |
" LinearRegression(normalize=True)\n", | |
" ]\n", | |
"]\n", | |
"\n", | |
"# StackNetモデルを作成\n", | |
"model = StackNetRegressor(\n", | |
" models, folds=10,\n", | |
" restacking=False, use_retraining=True,\n", | |
" random_state=seed, n_jobs=-1\n", | |
")\n", | |
"\n", | |
"# 精度出力\n", | |
"kfold = KFold(n_splits=10, random_state=seed)\n", | |
"cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=\"neg_mean_absolute_error\")\n", | |
"print(name, ': ', -np.mean(cv_results))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import matplotlib\n", | |
"matplotlib.rcParams[\"figure.figsize\"] = (16, 4)\n", | |
"plt.rcParams[\"font.family\"] = \"IPAexGothic\"\n", | |
"import logging\n", | |
"logging.basicConfig(level=logging.INFO)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"x = [49.5, 54.1, 5.37, 64, 691]\n", | |
"y = [2.26, 2.14, 3.04, 2.16, 2.20]\n", | |
"labels = ['単純比較', 'Stacking', 'Blending', 'Weighted Average', 'Stacknet']\n", | |
"\n", | |
"fig, ax = plt.subplots(figsize=(16, 8))\n", | |
"ax.scatter(x, y, s=600, c=\"pink\", alpha=0.5)\n", | |
"\n", | |
"for i, txt in enumerate(labels):\n", | |
" if i == 1:\n", | |
" ax.annotate(txt, (x[i], y[i]-.04), fontsize=20)\n", | |
" else:\n", | |
" ax.annotate(txt, (x[i], y[i]+.01), fontsize=15)\n", | |
"ax.set_ylabel('Absolute mean error', fontsize=20)\n", | |
"ax.set_xlabel('秒数', fontsize=20)\n", | |
"ax.set_xscale('log')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment