Skip to content

Instantly share code, notes, and snippets.

View erykml's full-sized avatar

Eryk Lewinson erykml

View GitHub Profile
# Decompose the forest's prediction for the first two entries of selected_df.
# Each entry is wrapped in a one-element array so ti.predict receives a
# single-row batch; joint_contribution=True is passed through to ti.predict.
# NOTE(review): `ti` is presumably the treeinterpreter package — confirm.
prediction1, bias1, contributions1 = ti.predict(
    rf,
    np.array([selected_df[0]]),
    joint_contribution=True,
)
prediction2, bias2, contributions2 = ti.predict(
    rf,
    np.array([selected_df[1]]),
    joint_contribution=True,
)

# Collapse each set of contributions with the project's helper; the result is
# used below as a mapping (its .keys() are iterated).
aggregated_contributions1 = utils.aggregated_contribution(contributions1)
aggregated_contributions2 = utils.aggregated_contribution(contributions2)

# Accumulator filled by the loop that follows.
res = []
for k in set(aggregated_contributions1.keys()).union(
set(aggregated_contributions2.keys())):
res.append(([X_train.columns[index] for index in k] ,
import lime
import lime.lime_tabular
# Build a LIME tabular explainer in regression mode from the raw training
# matrix, naming the features after the X_train columns and treating the
# column at index 3 as categorical.
# NOTE(review): LIME documents `categorical_names` as a map from column index
# to the list of value names (e.g. {3: [...]}) — a plain list like the one
# below is probably ignored; confirm and supply the real value names.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode='regression',
    feature_names=X_train.columns,
    categorical_features=[3],
    categorical_names=['CHAS'],
    discretize_continuous=True,
)
# Share of observations treated as anomalies, expressed as a fraction
# (0.009 corresponds to 0.9 %).
anomalies_ratio = 0.009

# Read the MATLAB file and wrap its 'X' and 'y' entries in pandas containers.
mat = scipy.io.loadmat('cover.mat')
X = pd.DataFrame(mat['X'])
# Keep only the first element of every row of mat['y'].
y = pd.Series([row[0] for row in mat['y']])
if_sk = IsolationForest(n_estimators = 100,
max_samples = 256,
contamination = anomalies_ratio,
# Fit the `iso` isolation forest on the raw feature matrix: 100 trees,
# 256 samples per tree, extension level 0.
# NOTE(review): `iso` is presumably the eif package — confirm.
if_eif = iso.iForest(
    X.values,
    ntrees=100,
    sample_size=256,
    ExtensionLevel=0,
)

# Score every observation that the forest was built on.
anomaly_scores = if_eif.compute_paths(X_in=X.values)

# argsort returns the positions that would order the scores ascending,
# not the sorted scores themselves.
anomaly_scores_sorted = np.argsort(anomaly_scores)
# retrieve indices of anomalous observations
def pp_plot(x, dist, line=True, ax=None):
'''
Function for comparing empirical data to a theoretical distribution by using a P-P plot.
Params:
x - empirical data
dist - distribution object from scipy.stats; for example scipy.stats.norm(0, 1)
line - boolean; specify if the reference line (y=x) should be drawn on the plot
ax - specified ax for subplots, None is standalone
'''
# Side-by-side P-P plots of rv_norm: the statsmodels implementation on the
# left, the custom pp_plot helper on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
fig.suptitle('PP-plots', fontsize=22)

# Left panel: ProbPlot against scs.norm with loc=0, scale=1 and the '45' line.
sm.ProbPlot(rv_norm, dist=scs.norm, loc=0, scale=1).ppplot(line='45', ax=ax[0])
ax[0].set_title('Statsmodels', fontsize=16)

# Right panel: same data and the same frozen distribution through pp_plot.
pp_plot(rv_norm, scs.norm(loc=0, scale=1), ax=ax[1])
ax[1].set_title('pp_plot', fontsize=16)
# Two panels: a Q-Q plot of rv_skew_norm on the left and overlaid normalized
# histograms of rv_std_norm and rv_skew_norm on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# Left panel: ProbPlot with its default reference distribution, line='s'.
sm.ProbPlot(rv_skew_norm).qqplot(line='s', ax=ax[0])
ax[0].set_title('Q-Q plot (vs. Normal)', fontsize=16)

# Right panel: histograms only (kde=False), normalized (norm_hist=True).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the pinned version before upgrading.
sns.distplot(rv_std_norm, kde=False, norm_hist=True, color='blue', label='Standard Normal', ax=ax[1])
sns.distplot(rv_skew_norm, kde=False, norm_hist=True, color='red', label='Skew Normal $\\alpha = 5$', ax=ax[1])

# Address the right-hand axes explicitly instead of going through
# plt.title / plt.legend, which act on whatever pyplot considers the current
# axes; this also matches how the left panel is titled.
ax[1].set_title('Comparison of distributions', fontsize=16)
ax[1].legend();
# Q-Q plot of rv_skew_norm against scs.skewnorm with distribution argument 5,
# drawn on a new figure; the title is applied through pyplot.
sm.ProbPlot(
    rv_skew_norm,
    dist=scs.skewnorm,
    distargs=(5,),
).qqplot(line='s')
plt.title('Q-Q plot (vs. Skew Normal)', fontsize=16);
# Two panels: a two-sample Q-Q plot (rv_skew_norm vs. rv_std_norm) on the left
# and overlaid normalized histograms of both samples on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# Wrap both samples without fitting distribution parameters (fit=False).
pp_x = sm.ProbPlot(rv_skew_norm, fit=False)
pp_y = sm.ProbPlot(rv_std_norm, fit=False)

# Left panel: quantiles of pp_x against those of pp_y. `fig` is rebound to the
# value returned by qqplot.
fig = pp_x.qqplot(line='s', other=pp_y, ax=ax[0])
ax[0].set_title('Q-Q plot (vs. Standard Normal)', fontsize=16)

# Right panel: histograms only (kde=False), normalized (norm_hist=True).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the pinned version before upgrading.
sns.distplot(rv_std_norm, kde=False, norm_hist=True, color='blue', label='Standard Normal', ax=ax[1])
sns.distplot(rv_skew_norm, kde=False, norm_hist=True, color='red', label='Skew Normal $\\alpha = 5$', ax=ax[1])

# Title the right-hand axes explicitly rather than via plt.title, which acts
# on whatever pyplot considers the current axes; this matches the left panel.
ax[1].set_title('Comparison of distributions', fontsize=16)
import pandas as pd
from sklearn.datasets import load_boston
# Load the Boston housing data and split it into features and target.
# NOTE(review): load_boston has been removed from recent scikit-learn
# releases (1.2+) — confirm the pinned version.
boston = load_boston()

# Features as a DataFrame labelled with the dataset's feature names; the
# 'CHAS' column is dropped in place.
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)
X.drop(columns='CHAS', inplace=True)

# Target as a Series named 'MEDV'.
y = pd.Series(data=boston.target, name='MEDV')
# inspect data