Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save noleto/987eb668e785a69e87ebf29f56fda55d to your computer and use it in GitHub Desktop.
Save noleto/987eb668e785a69e87ebf29f56fda55d to your computer and use it in GitHub Desktop.
XGBoost and ELI5 - misleading local feature contribution
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import xgboost as xgb\n",
"import eli5\n",
"import sklearn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"XGBoost version: 0.72.1\n",
"ELI5 version: 0.8\n",
"Pandas version: 0.23.3\n",
"Numpy version: 1.14.5\n",
"Sckit-learn version: 0.19.2\n"
]
}
],
"source": [
"print(f\"XGBoost version: {xgb.__version__}\")\n",
"print(f\"ELI5 version: {eli5.__version__}\")\n",
"print(f\"Pandas version: {pd.__version__}\")\n",
"print(f\"Numpy version: {np.__version__}\")\n",
"print(f\"Sckit-learn version: {sklearn.__version__}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"samples = 100\n",
"np.random.seed(1984)\n",
"X = np.random.uniform(0, 1, (samples, 14))\n",
"\n",
"np.random.seed(55)\n",
"y = X[:,0]*X[:,3] + (X[:,2] - 0.5)**2 + np.abs(X[:,7])*np.power(X[:,8], 2) + np.random.normal(0, 0.2, samples)\n",
"\n",
"features_names = [(\"x%s\" % i) for i in range(1, 15)]\n",
"simulated_data = pd.DataFrame(X, columns=features_names)\n",
"simulated_data[\"target\"] = y"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(simulated_data[features_names],\n",
" simulated_data.target,\n",
" test_size=0.30, \n",
" random_state=180)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
" colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
" max_depth=2, min_child_weight=1, missing=None, n_estimators=2,\n",
" n_jobs=1, nthread=None, objective='reg:linear', random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,\n",
" subsample=1)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"estimator = xgb.XGBRegressor(n_estimators=2, max_depth=2, seed=0)\n",
"estimator.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## test\n",
"index_instance = 0\n",
"single_instance = X_test.iloc[[index_instance]]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Right: 0.9752943770152533\n",
"XGB Prediction: 0.5744506\n"
]
}
],
"source": [
"print(\"Right:\", y_test.iloc[index_instance])\n",
"print(\"XGB Prediction:\", estimator.predict(single_instance)[0])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Local explanation for XGB\n"
]
},
{
"data": {
"text/html": [
"\n",
" <style>\n",
" table.eli5-weights tr:hover {\n",
" filter: brightness(85%);\n",
" }\n",
"</style>\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" <p style=\"margin-bottom: 0.5em; margin-top: 0em\">\n",
" <b>\n",
" \n",
" y\n",
" \n",
"</b>\n",
"\n",
" \n",
" (score <b>0.074</b>)\n",
"\n",
"top features\n",
" </p>\n",
" \n",
" <table class=\"eli5-weights\"\n",
" style=\"border-collapse: collapse; border: none; margin-top: 0em; table-layout: auto; margin-bottom: 2em;\">\n",
" <thead>\n",
" <tr style=\"border: none;\">\n",
" \n",
" <th style=\"padding: 0 1em 0 0.5em; text-align: right; border: none;\" title=\"Feature contribution already accounts for the feature value (for linear models, contribution = weight * feature value), and the sum of feature contributions is equal to the score or, for some classifiers, to the probability. Feature values are shown if &quot;show_feature_values&quot; is True.\">\n",
" Contribution<sup>?</sup>\n",
" </th>\n",
" \n",
" <th style=\"padding: 0 0.5em 0 0.5em; text-align: left; border: none;\">Feature</th>\n",
" \n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" \n",
" <tr style=\"background-color: hsl(120, 100.00%, 80.00%); border: none;\">\n",
" <td style=\"padding: 0 1em 0 0.5em; text-align: right; border: none;\">\n",
" +0.055\n",
" </td>\n",
" <td style=\"padding: 0 0.5em 0 0.5em; text-align: left; border: none;\">\n",
" x7\n",
" </td>\n",
" \n",
"</tr>\n",
" \n",
" <tr style=\"background-color: hsl(120, 100.00%, 87.00%); border: none;\">\n",
" <td style=\"padding: 0 1em 0 0.5em; text-align: right; border: none;\">\n",
" +0.030\n",
" </td>\n",
" <td style=\"padding: 0 0.5em 0 0.5em; text-align: left; border: none;\">\n",
" x8\n",
" </td>\n",
" \n",
"</tr>\n",
" \n",
" \n",
"\n",
" \n",
" \n",
" <tr style=\"background-color: hsl(0, 100.00%, 93.94%); border: none;\">\n",
" <td style=\"padding: 0 1em 0 0.5em; text-align: right; border: none;\">\n",
" -0.010\n",
" </td>\n",
" <td style=\"padding: 0 0.5em 0 0.5em; text-align: left; border: none;\">\n",
" &lt;BIAS&gt;\n",
" </td>\n",
" \n",
"</tr>\n",
" \n",
"\n",
" </tbody>\n",
" </table>\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
" \n",
"\n",
"\n",
"\n"
],
"text/plain": [
"Explanation(estimator='<xgboost.core.Booster object at 0x7f8564fbc0b8>', description='\\nFeatures with largest coefficients.\\n\\nFeature weights are calculated by following decision paths in trees\\nof an ensemble. Each leaf has an output score, and expected scores can also be\\nassigned to parent nodes. Contribution of one feature on the decision path\\nis how much expected score changes from parent to child. Weights of all \\nfeatures sum to the output score of the estimator.\\n\\nCaveats:\\n1. Feature weights just show if the feature contributed positively or\\n negatively to the final score, and does show how increasing or decreasing\\n the feature value will change the prediction.\\n2. In some cases, feature weight can be close to zero for an important feature.\\n For example, in a single tree that computes XOR function, the feature at the\\n top of the tree will have zero weight because expected scores for both\\n branches are equal, so decision at the top feature does not change the\\n expected score. For an ensemble predicting XOR functions it might not be\\n a problem, but it is not reliable if most trees happen to choose the same\\n feature at the top.\\n', error=None, method='decision paths', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='x7', weight=0.05479164583692307, std=None, value=0.6577437793296175), FeatureWeight(feature='x8', weight=0.029618746772362637, std=None, value=0.6823370055471244)], neg=[FeatureWeight(feature='<BIAS>', weight=-0.009959802909285713, std=None, value=1.0)], pos_remaining=0, neg_remaining=0), proba=None, score=0.0744505897, weighted_spans=None)], feature_importances=None, decision_tree=None, highlight_spaces=None, transition_features=None)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"Local explanation for XGB\")\n",
"eli5.explain_prediction(estimator.get_booster(), single_instance)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0:[x9<0.446416348] yes=1,no=2,missing=1,gain=2.26438522,cover=70\n",
"\t1:[x3<0.664960206] yes=3,no=4,missing=3,gain=0.766629934,cover=31\n",
"\t\t3:leaf=-0.0331465006,cover=24\n",
"\t\t4:leaf=0.00379290315,cover=7\n",
"\t2:[x8<0.454566181] yes=5,no=6,missing=5,gain=2.4569304,cover=39\n",
"\t\t5:leaf=-0.012301621,cover=21\n",
"\t\t6:leaf=0.036901962,cover=18\n",
"\n"
]
}
],
"source": [
"## debugging trees\n",
"print(estimator.get_booster().get_dump(with_stats=True)[0])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0:[x9<0.446416348] yes=1,no=2,missing=1,gain=1.85169756,cover=70\n",
"\t1:[x3<0.664960206] yes=3,no=4,missing=3,gain=0.627843201,cover=31\n",
"\t\t3:leaf=-0.0299644377,cover=24\n",
"\t\t4:leaf=0.00346102612,cover=7\n",
"\t2:[x8<0.522416055] yes=5,no=6,missing=5,gain=2.0536201,cover=39\n",
"\t\t5:leaf=-0.00843466166,cover=24\n",
"\t\t6:leaf=0.0375486277,cover=15\n",
"\n"
]
}
],
"source": [
"print(estimator.get_booster().get_dump(with_stats=True)[1])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([6, 6], dtype=int32)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"leaf_ids, = estimator.get_booster().predict(xgb.DMatrix(single_instance, feature_names=single_instance.columns),\n",
" pred_leaf=True)\n",
"leaf_ids"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0744505897\n"
]
}
],
"source": [
"##leaf for first tree: 6:leaf=0.036901962,cover=18\n",
"##leaf second tree: 6:leaf=0.0375486277,cover=15\n",
"print(0.036901962+0.0375486277)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5744505897\n"
]
}
],
"source": [
"##eli5 comptued score does not take into account the XGBoost base_score\n",
"print(0.036901962+0.0375486277+estimator.base_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment