Builds off the SHAP package to list out the feature effects per row on an XGBoost model.
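For each row, the helper returns name/effect/value triples for the top n features, either as a dataframe or flattened into a single list, e.g. ['LSTAT', -1.72, 4.98, 'RM', 0.94, 6.575, ...] (values here are purely illustrative).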
#!/usr/bin/env python
# Python 3.6.4
import numpy as np
import pandas as pd
import iml
import xgboost
import shap
from tqdm import tqdm
def calculate_top_contributors(shap_values, features=None, feature_names=None, use_abs=False, return_df=False,
                               n_features=5):
    """ Adapted from the SHAP package for visualizing the contributions of features towards a prediction.
        https://github.com/slundberg/shap

    Args:
        shap_values: np.array of floats
        features: pandas.core.series.Series, the data with the values
        feature_names: list, all the feature names/column names
        use_abs: bool, if True, will sort the data by the absolute value of the feature effect
        return_df: bool, if True, will return a pandas dataframe, else will return a list of feature, effect, value
        n_features: int, the number of features to report on. If it equals -1 it will return the entire dataframe

    Returns:
        if return_df is True: returns a pandas dataframe
        if return_df is False: returns a flattened list by name, effect, and value
    """
    assert not type(shap_values) == list, "The shap_values arg looks multi-output, try shap_values[i]."
    assert len(shap_values.shape) == 1, "Expected just one row. Please only submit one row at a time."

    shap_values = np.reshape(shap_values, (1, len(shap_values)))
    instance = iml.Instance(np.zeros((1, len(feature_names))), features)
    link = iml.links.convert_to_link('identity')

    # explanation obj
    expl = iml.explanations.AdditiveExplanation(
        shap_values[0, -1],                 # base value (the bias column appended by pred_contribs)
        np.sum(shap_values[0, :]),          # this row's prediction value
        shap_values[0, :-1],                # matrix of per-feature effects
        None,                               # effects_var (not used here)
        instance,                           # <iml.common.Instance object>
        link,                               # 'identity'
        iml.Model(None, ["output value"]),  # <iml.common.Model object>
        iml.datatypes.DenseData(np.zeros((1, len(feature_names))), list(feature_names))
    )

    # Get the name, effect and value for each feature, if there was an effect
    features_ = {}
    for i in range(len(expl.data.group_names)):
        if expl.effects[i] != 0:
            features_[i] = {
                "effect": ensure_not_numpy(expl.effects[i]),
                "value": ensure_not_numpy(expl.instance.group_display_values[i]),
                "name": expl.data.group_names[i]
            }

    effect_df = pd.DataFrame([v for k, v in features_.items()])
    if use_abs:  # sort by the magnitude of the effect, regardless of sign
        effect_df['abs_effect'] = effect_df['effect'].apply(np.abs)
        effect_df.sort_values('abs_effect', ascending=False, inplace=True)
    else:
        effect_df.sort_values('effect', ascending=False, inplace=True)

    if not n_features == -1:
        effect_df = effect_df.head(n_features)

    if return_df:
        return effect_df.reset_index(drop=True)
    else:
        list_of_info = list(zip(effect_df.name, effect_df.effect, effect_df.value))
        effect_list = list(sum(list_of_info, ()))  # flattens the list of tuples
        return effect_list
def create_prediction_factors_df(contribs, X, clf):
    """Builds a per-row factors dataframe from the SHAP contributions, the feature data, and the model

    Args:
        contribs: numpy matrix of SHAP values, one row per observation (bias term in the last column)
        X: pandas DataFrame of the feature data
        clf: XGBoost booster (or anything exposing a `feature_names` attribute)

    Returns:
        pd.DataFrame of the factors
    """
    factors = []
    for i in tqdm(range(len(X))):  # positional indexing, so a non-default index on X is safe
        vals = calculate_top_contributors(shap_values=contribs[i, :], features=X.iloc[i, :],
                                          feature_names=clf.feature_names)
        factors.append(vals)

    df = pd.DataFrame(factors, columns=['F1', 'F1_effect', 'F1_value', 'F2', 'F2_effect', 'F2_value',
                                        'F3', 'F3_effect', 'F3_value', 'F4', 'F4_effect', 'F4_value',
                                        'F5', 'F5_effect', 'F5_value'])
    return df
def ensure_not_numpy(x):
    """Helper function borrowed from the iml package"""
    if isinstance(x, bytes):
        return x.decode()
    elif isinstance(x, str):  # np.str was just an alias for str and is removed in newer NumPy
        return str(x)
    elif isinstance(x, np.generic):
        return float(x.item())  # .item() replaces the deprecated np.asscalar
    else:
        return x
if __name__ == '__main__':
    # train XGBoost model
    X, y = shap.datasets.boston()
    bst = xgboost.train({"learning_rate": 0.1}, xgboost.DMatrix(X, label=y), 100)

    # explain the model's predictions using SHAP values (use pred_contrib in LightGBM)
    shap_values = bst.predict(xgboost.DMatrix(X), pred_contribs=True)

    # just the regular predictions
    pred_prob = bst.predict(xgboost.DMatrix(X))
    # or rounded to labels (only meaningful for a classifier; Boston is a regression dataset)
    pred_label = np.round(pred_prob)

    # for just the first observation
    vals = calculate_top_contributors(shap_values=shap_values[0, :], features=X.iloc[0, :],
                                      feature_names=bst.feature_names, use_abs=False)

    # or as a dataframe for every row
    factors_df = create_prediction_factors_df(shap_values, X, clf=bst)
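The comment above mentions LightGBM's pred_contrib; a minimal, untested sketch of that path follows. It assumes a trained lightgbm Booster, and since LightGBM exposes feature names via the feature_name() method rather than a feature_names attribute, a small hypothetical shim bridges the gap:

    import lightgbm as lgb

    gbm = lgb.train({"learning_rate": 0.1}, lgb.Dataset(X, label=y), num_boost_round=100)
    contribs = gbm.predict(X, pred_contrib=True)  # last column is the expected value

    class LGBShim:  # hypothetical shim: Booster has feature_name(), not .feature_names
        feature_names = gbm.feature_name()

    factors_df = create_prediction_factors_df(contribs, X, clf=LGBShim)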
I haven't tried this with another model, but as long as the CatBoost model implements the "feature_names" attribute you should be good to use the create_prediction_factors_df function.
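For reference, an untested sketch of that route. It assumes CatBoost's get_feature_importance(..., type='ShapValues'), whose output puts the expected value in the last column (the same layout as XGBoost's pred_contribs); CatBoost exposes feature_names_ rather than feature_names, so a hypothetical shim supplies the attribute the helper expects:

    from catboost import CatBoostRegressor, Pool

    model = CatBoostRegressor(iterations=100, verbose=False)
    model.fit(X, y)

    # last column is the expected value, same layout as XGBoost's pred_contribs
    contribs = model.get_feature_importance(Pool(X, y), type='ShapValues')

    class CatBoostShim:  # hypothetical shim exposing the attribute the helper reads
        feature_names = model.feature_names_

    factors_df = create_prediction_factors_df(contribs, X, clf=CatBoostShim)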