@cheevahagadog
Last active November 17, 2020 05:36
Builds off the SHAP package to list out the feature effects per row on an XGBoost model.
#!/usr/bin/env python
# Python 3.6.4
import numpy as np
import pandas as pd
import iml
import xgboost
import shap
from tqdm import tqdm

def calculate_top_contributors(shap_values, features=None, feature_names=None, use_abs=False,
                               return_df=False, n_features=5):
    """Adapted from the SHAP package for visualizing the contributions of features towards a prediction.
    https://github.com/slundberg/shap

    Args:
        shap_values: np.array of floats, one row of SHAP values (with the base value as its last element)
        features: pandas.core.series.Series, the row of data with the feature values
        feature_names: list, all the feature/column names
        use_abs: bool, if True, sorts the features by the absolute value of their effect
        return_df: bool, if True, returns a pandas DataFrame; otherwise a flattened list of name, effect, value
        n_features: int, the number of features to report on; if -1, all features are returned

    Returns:
        if return_df is True: a pandas DataFrame
        if return_df is False: a list flattened into name, effect, value triples
    """
    assert not isinstance(shap_values, list), "The shap_values arg looks multi-output, try shap_values[i]."
    assert len(shap_values.shape) == 1, "Expected just one row. Please only submit one row at a time."

    shap_values = np.reshape(shap_values, (1, len(shap_values)))
    instance = iml.Instance(np.zeros((1, len(feature_names))), features)
    link = iml.links.convert_to_link('identity')

    # build the additive explanation object (as SHAP does internally for its force plots)
    expl = iml.explanations.AdditiveExplanation(
        shap_values[0, -1],                 # base value (last element of the pred_contribs row)
        np.sum(shap_values[0, :]),          # this row's prediction value
        shap_values[0, :-1],                # per-feature effects
        None,                               # no effect variances
        instance,                           # <iml.common.Instance object>
        link,                               # 'identity'
        iml.Model(None, ["output value"]),  # <iml.common.Model object>
        iml.datatypes.DenseData(np.zeros((1, len(feature_names))), list(feature_names))
    )
    # Get the name, effect, and value for each feature that had a nonzero effect
    features_ = {}
    for i in range(len(expl.data.group_names)):
        if expl.effects[i] != 0:
            features_[i] = {
                "effect": ensure_not_numpy(expl.effects[i]),
                "value": ensure_not_numpy(expl.instance.group_display_values[i]),
                "name": expl.data.group_names[i]
            }
    effect_df = pd.DataFrame([v for k, v in features_.items()])

    if use_abs:  # sort by the absolute value of the effect
        effect_df['abs_effect'] = effect_df['effect'].abs()
        effect_df.sort_values('abs_effect', ascending=False, inplace=True)
    else:
        effect_df.sort_values('effect', ascending=False, inplace=True)

    if n_features != -1:
        effect_df = effect_df.head(n_features)

    if return_df:
        return effect_df.reset_index(drop=True)
    else:
        list_of_info = list(zip(effect_df['name'], effect_df['effect'], effect_df['value']))
        effect_list = list(sum(list_of_info, ()))  # flattens the list of tuples
        return effect_list

def create_prediction_factors_df(contribs, X, clf):
    """Builds a factors DataFrame from the SHAP contributions, the feature data, and the model.

    Args:
        contribs: numpy matrix of SHAP values, one row per observation
        X: pandas DataFrame of the feature data
        clf: XGBoost booster (or any model exposing a feature_names attribute)

    Returns:
        pd.DataFrame of the top five factors per row
    """
    factors = []
    # iterate positionally so contribs rows stay aligned with X rows regardless of X's index labels
    for i in tqdm(range(len(X))):
        vals = calculate_top_contributors(shap_values=contribs[i, :], features=X.iloc[i, :],
                                          feature_names=clf.feature_names)
        factors.append(vals)

    df = pd.DataFrame(factors, columns=['F1', 'F1_effect', 'F1_value', 'F2', 'F2_effect', 'F2_value',
                                        'F3', 'F3_effect', 'F3_value', 'F4', 'F4_effect', 'F4_value',
                                        'F5', 'F5_effect', 'F5_value'])
    return df

def ensure_not_numpy(x):
    """Helper function borrowed from the iml package: casts numpy scalars to plain Python types."""
    if isinstance(x, bytes):
        return x.decode()
    elif isinstance(x, np.str_):
        return str(x)
    elif isinstance(x, np.generic):
        return float(x.item())  # .item() replaces the deprecated np.asscalar
    else:
        return x

if __name__ == '__main__':
    # train an XGBoost model on the Boston housing dataset
    X, y = shap.datasets.boston()
    bst = xgboost.train({"learning_rate": 0.1}, xgboost.DMatrix(X, label=y), 100)

    # explain the model's predictions using SHAP values (use pred_contrib in LightGBM)
    shap_values = bst.predict(xgboost.DMatrix(X), pred_contribs=True)

    # just the regular predictions
    pred_prob = bst.predict(xgboost.DMatrix(X))
    # or as labels
    pred_label = np.round(pred_prob)

    # for just the first observation
    vals = calculate_top_contributors(shap_values=shap_values[0, :], features=X.iloc[0, :],
                                      feature_names=bst.feature_names, use_abs=False)

    # or as a dataframe for every observation
    factors_df = create_prediction_factors_df(shap_values, X, clf=bst)
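
    # --- A hedged sketch, not part of the original gist: the same flow with LightGBM. ---
    # LightGBM returns the identical layout (per-feature contributions plus the base
    # value as the last column) via pred_contrib=True on Booster.predict. Note that a
    # LightGBM Booster exposes its feature names through the feature_name() method
    # rather than a feature_names attribute, so pass them in explicitly:
    #
    # import lightgbm as lgb
    # lgb_bst = lgb.train({"objective": "regression"}, lgb.Dataset(X, label=y), 100)
    # lgb_contribs = lgb_bst.predict(X, pred_contrib=True)
    # lgb_vals = calculate_top_contributors(shap_values=lgb_contribs[0, :], features=X.iloc[0, :],
    #                                       feature_names=lgb_bst.feature_name())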
@cheevahagadog (Author) commented:
I haven't tried this with another model, but as long as the CatBoost model implements the "feature_names" attribute you should be good to use the create_prediction_factors_df function.
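
A minimal sketch of that idea, assuming catboost is installed (untested here; note CatBoost actually exposes feature_names_ with a trailing underscore, so passing the DataFrame's own columns sidesteps the attribute-name difference, and get_feature_importance with type='ShapValues' returns the contributions with the expected value as the last column):

    from catboost import CatBoostRegressor, Pool
    model = CatBoostRegressor(verbose=0).fit(X, y)
    contribs = model.get_feature_importance(Pool(X, y), type='ShapValues')  # last column = expected value
    vals = calculate_top_contributors(shap_values=contribs[0, :], features=X.iloc[0, :],
                                      feature_names=list(X.columns))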
