Eryk Lewinson erykml

Data Scientist, Author of the Python for Finance Cookbook (published by Packt).

erykml / treeinterpreter_dd.py

Created February 11, 2019 22:27

	# Decompose the forest's prediction for the first two entries of
	# `selected_df`. Each call returns a (prediction, bias, contributions)
	# triple; joint_contribution=True is passed through to treeinterpreter
	# (presumably yielding contributions per group of features -- verify
	# against the treeinterpreter docs).
	first_obs = np.array([selected_df[0]])
	second_obs = np.array([selected_df[1]])
	prediction1, bias1, contributions1 = ti.predict(rf, first_obs, joint_contribution=True)
	prediction2, bias2, contributions2 = ti.predict(rf, second_obs, joint_contribution=True)

	# Collapse each raw contribution structure into a single keyed mapping so
	# the two observations can be compared key by key.
	aggregated_contributions1, aggregated_contributions2 = (
	    utils.aggregated_contribution(contribs)
	    for contribs in (contributions1, contributions2)
	)

	res = []
	for k in set(aggregated_contributions1.keys()).union(
	set(aggregated_contributions2.keys())):
	res.append(([X_train.columns[index] for index in k] ,

erykml / lime.py

Created February 11, 2019 22:30

	import lime
	import lime.lime_tabular

	# Build a LIME explainer for a tabular regression problem, handing it the
	# training matrix and the matching column names.
	explainer = lime.lime_tabular.LimeTabularExplainer(
	    X_train.values,
	    mode='regression',
	    feature_names=X_train.columns,
	    # column index 3 is treated as categorical instead of being discretized
	    categorical_features=[3],
	    # LIME expects a mapping {column index: [name of value 0, name of value 1, ...]}
	    # here, not a list of column names (a list is silently ignored).
	    # Assumes column 3 (CHAS) is a 0/1 dummy -- confirm against the data.
	    categorical_names={3: ['0', '1']},
	    discretize_continuous=True,
	)

erykml / if_sk.py

Created March 9, 2019 22:46

	# Read the MATLAB file and wrap its 'X' and 'y' entries in pandas objects.
	mat = scipy.io.loadmat('cover.mat')
	X = pd.DataFrame(mat['X'])
	# take element 0 of every row of mat['y'] to get a flat sequence of labels
	labels = [row[0] for row in mat['y']]
	y = pd.Series(labels)

	# fraction of observations treated as anomalies (0.009, i.e. 0.9%)
	anomalies_ratio = 0.009

	if_sk = IsolationForest(n_estimators = 100,
	max_samples = 256,
	contamination = anomalies_ratio,

erykml / if_eif.py

Created March 9, 2019 22:52

	# Fit the `iso.iForest` model on the raw feature matrix with 100 trees and
	# a sample size of 256.
	# NOTE(review): ExtensionLevel=0 presumably reproduces the standard
	# isolation forest -- confirm in the eif documentation.
	features = X.values
	if_eif = iso.iForest(features, ntrees=100, sample_size=256, ExtensionLevel=0)

	# score the same observations the forest was fitted on
	anomaly_scores = if_eif.compute_paths(X_in=features)
	# argsort returns the positions that would order the scores ascending
	anomaly_scores_sorted = np.argsort(anomaly_scores)
	# retrieve indices of anomalous observations

erykml / pp_plot.py

Last active April 14, 2019 21:05

	def pp_plot(x, dist, line=True, ax=None):
	'''
	Function for comparing empirical data to a theoretical distribution by using a P-P plot.

	Params:
	x - empirical data
	dist - distribution object from scipy.stats; for example scipy.stats.norm(0, 1)
	line - boolean; specify if the reference line (y=x) should be drawn on the plot
	ax - specified ax for subplots, None is standalone
	'''

erykml / pp_plots.py

Last active April 14, 2019 23:20

	# Side-by-side P-P plots of the same sample: statsmodels on the left,
	# the custom pp_plot helper on the right.
	fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
	fig.suptitle('PP-plots', fontsize=22)
	left_ax, right_ax = ax

	# statsmodels version: compare against norm(loc=0, scale=1) with the '45' reference line
	sm_probplot = sm.ProbPlot(rv_norm, dist=scs.norm, loc=0, scale=1)
	sm_probplot.ppplot(line='45', ax=left_ax)
	left_ax.set_title('Statsmodels', fontsize=16)

	# custom helper, given the equivalent frozen distribution object
	standard_normal = scs.norm(loc=0, scale=1)
	pp_plot(rv_norm, standard_normal, ax=right_ax)
	right_ax.set_title('pp_plot', fontsize=16)

erykml / skew_norm_qqplot.py

Created April 14, 2019 23:20

	# Left panel: Q-Q plot of the skew-normal sample against ProbPlot's default
	# reference distribution. Right panel: histograms of both samples.
	fig, ax = plt.subplots(1, 2, figsize=(15, 8))

	sm.ProbPlot(rv_skew_norm).qqplot(line='s', ax=ax[0])
	ax[0].set_title('Q-Q plot (vs. Normal)', fontsize=16)

	# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
	# as-is because the seaborn version in use is not visible here.
	sns.distplot(rv_std_norm, kde=False, norm_hist=True, color='blue', label='Standard Normal', ax=ax[1])
	sns.distplot(rv_skew_norm, kde=False, norm_hist=True, color='red', label='Skew Normal $\\alpha = 5$', ax=ax[1])
	# Title and legend are set on the right-hand axes explicitly rather than
	# through plt.title()/plt.legend(), which act on whichever axes is current.
	ax[1].set_title('Comparison of distributions', fontsize=16)
	ax[1].legend();

erykml / skew_skew.py

Last active April 15, 2019 23:26

	# Q-Q plot of the sample against scs.skewnorm, with 5 passed as the
	# distribution's shape argument; the title goes on the figure qqplot creates.
	skew_probplot = sm.ProbPlot(rv_skew_norm, dist=scs.skewnorm, distargs=(5,))
	skew_probplot.qqplot(line='s')
	plt.title('Q-Q plot (vs. Skew Normal)', fontsize=16);

erykml / skew_std_norm.py

Created April 15, 2019 19:29

	# Left panel: two-sample Q-Q plot of the skew-normal sample against the
	# standard-normal sample. Right panel: histograms of both samples.
	fig, ax = plt.subplots(1, 2, figsize=(15, 8))

	# fit=False: neither sample has distribution parameters estimated from it
	pp_x = sm.ProbPlot(rv_skew_norm, fit=False)
	pp_y = sm.ProbPlot(rv_std_norm, fit=False)
	fig = pp_x.qqplot(line='s', other=pp_y, ax=ax[0])
	ax[0].set_title('Q-Q plot (vs. Standard Normal)', fontsize=16)

	# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
	# as-is because the seaborn version in use is not visible here.
	sns.distplot(rv_std_norm, kde=False, norm_hist=True, color='blue', label='Standard Normal', ax=ax[1])
	sns.distplot(rv_skew_norm, kde=False, norm_hist=True, color='red', label='Skew Normal $\\alpha = 5$', ax=ax[1])
	# Set the title on the right-hand axes explicitly rather than through
	# plt.title(), which acts on whichever axes happens to be current.
	ax[1].set_title('Comparison of distributions', fontsize=16)

erykml / import_boston.py

Created June 3, 2019 16:20

	import pandas as pd
	from sklearn.datasets import load_boston

	# Build the feature frame and the target series from load_boston().
	# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
	# in 1.2 -- confirm the installed scikit-learn version still provides it.
	boston = load_boston()
	# the CHAS column is excluded from the feature frame
	X = pd.DataFrame(boston.data, columns=boston.feature_names).drop('CHAS', axis=1)
	y = pd.Series(boston.target, name='MEDV')

	# inspect data