Last active
October 15, 2021 12:54
-
-
Save BioSciEconomist/c4964578f0fff67080fed37726b5e157 to your computer and use it in GitHub Desktop.
toy example using shap values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *-----------------------------------------------------------------
# | PROGRAM NAME: toy SHAP.py
# | DATE: 10/14/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: toy example using shap values
# *----------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import sklearn
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
#
# generate some data
#
# Toy sample of 14 observations; each list below is one column.
#   wtchg   - numeric outcome (used as the response Y further down;
#             presumably "weight change" -- units are not stated)
#   app     - 0/1 indicator
#   age     - integer age
#   genderF - 0/1 indicator
data = {
    'wtchg': [-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
    'app': [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
    'age': [33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
    'genderF': [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
}

# convert to a data frame; `columns` fixes the column order explicitly
df = pd.DataFrame(data, columns=['app', 'wtchg', 'age', 'genderF'])
#
# random forest model
#

# prep data: wtchg is the response, the remaining columns are the predictors
Y = df['wtchg']
X = df[['app', 'age', 'genderF']]

# fit model (random_state is fixed so repeated runs build the same forest)
rf = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
rf.fit(X, Y)

# feature importance (read once, reused for the printout and the plot)
importances = rf.feature_importances_
print(importances)

# visualize feature importance
# argsort is ascending, so the most important feature gets the highest
# y position and is drawn as the top bar
indices = np.argsort(importances)
features = X.columns
positions = range(len(indices))
plt.title('Feature Importances')
plt.barh(positions, importances[indices], color='b', align='center')
plt.yticks(positions, [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
#
# SHAP values
#
import shap

# define model
# The forest fitted in the random forest section is reused here. Refitting
# with the same hyperparameters, seed and data would rebuild the same
# model, so `model` is simply bound to the already-fitted `rf`.
model = rf

# explain SHAP values
# shap.Explainer picks the explanation algorithm from the model it is given;
# calling the explainer on X returns the SHAP values for every row of X.
explainer = shap.Explainer(model)
shap_values = explainer(X)

# visualize SHAP values and feature dependencies
# hclust builds a hierarchical clustering of the features, which the bar
# plot uses to group related features together
clust = shap.utils.hclust(X, Y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

# summary plot of SHAP values
shap.summary_plot(shap_values, X)
#
# example from documentation
#
# ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
import pandas as pd
import shap
# import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.linear_model` is loaded
import sklearn.linear_model

# a classic housing price dataset
# NOTE(review): newer shap releases appear to deprecate/remove this loader
# (shap.datasets.california() looks like the replacement) -- confirm against
# the installed shap version before switching datasets.
X, y = shap.datasets.boston()
X100 = shap.utils.sample(X, 100)  # 100 instances for use as the background distribution

# a simple linear model
model = sklearn.linear_model.LinearRegression()
model.fit(X, y)

# model output: one "<feature> = <coefficient>" line per column of X
print("Model coefficients:\n")
for name, coef in zip(X.columns, model.coef_):
    print(name, "=", coef.round(4))

# compute the SHAP values for the linear model
explainer = shap.Explainer(model.predict, X100)
shap_values = explainer(X)

# the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
# shap_values is deliberately NOT overwritten with a one-row slice here:
# doing so makes shap_values[sample_ind] index past the end of the slice and
# leaves the plots below with one row of SHAP values against the full X.
sample_ind = 18
shap.plots.waterfall(shap_values[sample_ind], max_display=14)

# shap bar plot (features grouped by hierarchical clustering)
clust = shap.utils.hclust(X, y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

# summary plot
shap.summary_plot(shap_values, X)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment