BioSciEconomist · October 15, 2021 12:54
diff --git a/toy SHAP.py b/toy SHAP.py
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: toy SHAP.py
 # | DATE: 10/14/21
 # | CREATED BY: MATT BOGARD
 # | PROJECT FILE:
 # *----------------------------------------------------------------
 # | PURPOSE: toy example using shap values 
 # *----------------------------------------------------------------

 import numpy as np
 import pandas as pd
 import scipy.stats
 import sklearn


 import matplotlib.pyplot as plt
 from sklearn.model_selection import train_test_split
 from sklearn import preprocessing
 from sklearn.ensemble import RandomForestRegressor

 #
 # generate some data
 #

 # toy observations: one list per variable, 14 rows each
 data = {
     'wtchg': [-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
     'app': [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
     'age': [33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
     'genderF': [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
 }

 # build the data frame, then arrange the columns in the order used below
 df = pd.DataFrame(data)
 df = df[['app', 'wtchg', 'age', 'genderF']]

 #
 # random forest model
 #

 # split the frame into the predictors (X) and the outcome to model (Y)
 X = df[['app', 'age', 'genderF']]
 Y = df['wtchg']

 # small forest with a fixed random_state; fit() returns the estimator itself
 rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0).fit(X, Y)

 # per-feature importances reported by the fitted forest
 importances = rf.feature_importances_
 print(importances)

 # horizontal bar chart, bars ordered by np.argsort of the importances
 features = X.columns
 indices = np.argsort(importances)
 plt.barh(range(len(indices)), importances[indices], align='center', color='b')
 plt.yticks(range(len(indices)), list(features[indices]))
 plt.title('Feature Importances')
 plt.xlabel('Relative Importance')
 plt.show()

 #
 # SHAP values
 #

 import shap

 # calculate SHAP values (model-agnostic route)
 # KernelExplainer takes the prediction function and a background data set
 # (here the full toy data).  Build the explainer first, then ask it for the
 # values, so that rf_shap_values holds SHAP values and not the explainer.
 kernel_explainer = shap.KernelExplainer(rf.predict, X)
 rf_shap_values = kernel_explainer.shap_values(X)

 # define model
 # reuse the forest fitted above: refitting an identical estimator on the same
 # data with the same random_state would only reproduce the same model
 model = rf

 # explain SHAP values
 # calling the explainer on X gives the object consumed by the plots below
 explainer = shap.Explainer(model)
 shap_values = explainer(X)

 # visualize SHAP values and feature dependencies
 # the feature clustering computed here is handed to the bar plot
 clust = shap.utils.hclust(X, Y, linkage="complete")
 shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

 # summary plot of SHAP values
 shap.summary_plot(shap_values, X)


 #
 # example from documentation
 #

 # ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html 

 import pandas as pd
 import shap
 import sklearn

 # a classic housing price dataset
 # NOTE(review): shap.datasets.boston() appears to have been dropped from
 # recent shap releases -- confirm the installed version still ships it, or
 # switch to another bundled dataset such as shap.datasets.california()
 X,y = shap.datasets.boston()
 X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution

 # a simple linear model
 # import the submodule explicitly: a bare "import sklearn" does not by itself
 # guarantee that sklearn.linear_model has been loaded
 import sklearn.linear_model
 model = sklearn.linear_model.LinearRegression()
 model.fit(X, y)

 # model output: one "<column> = <coefficient>" line per column of X
 print("Model coefficients:\n")
 for feature_name, coefficient in zip(X.columns, model.coef_):
     print(feature_name, "=", coefficient.round(4))

 # compute the SHAP values for the linear model
 # the explainer receives the prediction function plus X100 as background data
 explainer = shap.Explainer(model.predict, X100)
 shap_values = explainer(X)

 # the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
 # Index the full set of SHAP values directly.  Re-binding shap_values to the
 # one-row slice [sample_ind:sample_ind+1] would put shap_values[sample_ind]
 # out of range and would leave the bar and summary plots below with a single
 # row that no longer lines up with X.
 sample_ind = 18
 shap.plots.waterfall(shap_values[sample_ind], max_display=14)


 # shap bar plot
 # the feature clustering computed here is handed to the bar plot
 clust = shap.utils.hclust(X, y, linkage="complete")
 shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

 # summary plot
 shap.summary_plot(shap_values, X)
	# *-----------------------------------------------------------------
	# | PROGRAM NAME: toy SHAP.py
	# | DATE: 10/14/21
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE:
	# *----------------------------------------------------------------
	# | PURPOSE: toy example using shap values
	# *----------------------------------------------------------------

	import numpy as np
	import pandas as pd
	import scipy.stats
	import sklearn


	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from sklearn import preprocessing
	from sklearn.ensemble import RandomForestRegressor

	#
	# generate some data
	#

	# toy observations: one list per variable, 14 rows each
	data = {
		'wtchg': [-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
		'app': [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
		'age': [33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
		'genderF': [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
	}

	# build the data frame, then arrange the columns in the order used below
	df = pd.DataFrame(data)
	df = df[['app', 'wtchg', 'age', 'genderF']]

	#
	# random forest model
	#

	# split the frame into the predictors (X) and the outcome to model (Y)
	X = df[['app', 'age', 'genderF']]
	Y = df['wtchg']

	# small forest with a fixed random_state; fit() returns the estimator itself
	rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0).fit(X, Y)

	# per-feature importances reported by the fitted forest
	importances = rf.feature_importances_
	print(importances)

	# horizontal bar chart, bars ordered by np.argsort of the importances
	features = X.columns
	indices = np.argsort(importances)
	plt.barh(range(len(indices)), importances[indices], align='center', color='b')
	plt.yticks(range(len(indices)), list(features[indices]))
	plt.title('Feature Importances')
	plt.xlabel('Relative Importance')
	plt.show()

	#
	# SHAP values
	#

	import shap

	# calculate SHAP values (model-agnostic route)
	# KernelExplainer takes the prediction function and a background data set
	# (here the full toy data).  Build the explainer first, then ask it for the
	# values, so that rf_shap_values holds SHAP values and not the explainer.
	kernel_explainer = shap.KernelExplainer(rf.predict, X)
	rf_shap_values = kernel_explainer.shap_values(X)

	# define model
	# reuse the forest fitted above: refitting an identical estimator on the same
	# data with the same random_state would only reproduce the same model
	model = rf

	# explain SHAP values
	# calling the explainer on X gives the object consumed by the plots below
	explainer = shap.Explainer(model)
	shap_values = explainer(X)

	# visualize SHAP values and feature dependencies
	# the feature clustering computed here is handed to the bar plot
	clust = shap.utils.hclust(X, Y, linkage="complete")
	shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

	# summary plot of SHAP values
	shap.summary_plot(shap_values, X)


	#
	# example from documentation
	#

	# ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

	import pandas as pd
	import shap
	import sklearn

	# a classic housing price dataset
	# NOTE(review): shap.datasets.boston() appears to have been dropped from
	# recent shap releases -- confirm the installed version still ships it, or
	# switch to another bundled dataset such as shap.datasets.california()
	X,y = shap.datasets.boston()
	X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution

	# a simple linear model
	# import the submodule explicitly: a bare "import sklearn" does not by itself
	# guarantee that sklearn.linear_model has been loaded
	import sklearn.linear_model
	model = sklearn.linear_model.LinearRegression()
	model.fit(X, y)

	# model output: one "<column> = <coefficient>" line per column of X
	# (the print call must be indented under the for statement to be its body)
	print("Model coefficients:\n")
	for feature_name, coefficient in zip(X.columns, model.coef_):
		print(feature_name, "=", coefficient.round(4))

	# compute the SHAP values for the linear model
	# the explainer receives the prediction function plus X100 as background data
	explainer = shap.Explainer(model.predict, X100)
	shap_values = explainer(X)

	# the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
	# Index the full set of SHAP values directly.  Re-binding shap_values to the
	# one-row slice [sample_ind:sample_ind+1] would put shap_values[sample_ind]
	# out of range and would leave the bar and summary plots below with a single
	# row that no longer lines up with X.
	sample_ind = 18
	shap.plots.waterfall(shap_values[sample_ind], max_display=14)


	# shap bar plot
	# the feature clustering computed here is handed to the bar plot
	clust = shap.utils.hclust(X, y, linkage="complete")
	shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

	# summary plot
	shap.summary_plot(shap_values, X)