Created
December 3, 2019 16:48
-
-
Save wbhinton/fbed217826b75499bb1a65000f0340ef to your computer and use it in GitHub Desktop.
Outlier detection using PyOD. Before running, narrow your data down to a two-column DataFrame named `df` with X and Y as the column names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Import models
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
# Work on duplicated columns so the raw X/Y values survive the scaling step.
df['X1'] = df['X']
df['Y1'] = df['Y']

# Rescale both copies into the unit square [0, 1] x [0, 1] so every
# detector sees features on the same footing.
minmax = MinMaxScaler(feature_range=(0, 1))
df[['X1', 'Y1']] = minmax.fit_transform(df[['X1', 'Y1']])
df[['X1', 'Y1']].head()

# Stack the two scaled columns into an (n_samples, 2) feature matrix.
X = np.concatenate(
    (df['X1'].values.reshape(-1, 1), df['Y1'].values.reshape(-1, 1)),
    axis=1,
)

# Fixed seed so the stochastic detectors are reproducible run to run.
random_state = np.random.RandomState(34)
# The percentage of observations that are not similar to the rest of the dataset.
outliers_fraction = 0.15
# Outlier detection tools to be compared; add or remove methods as needed.
# Every detector is configured with the same contamination rate so their
# outlier counts are directly comparable.
# NOTE: whichever detector runs last leaves its 0/1 'outlier' column on df.
classifiers = {
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(
        contamination=outliers_fraction,
        check_estimator=False,
        random_state=random_state,
    ),
    'Feature Bagging': FeatureBagging(
        LOF(n_neighbors=35),
        contamination=outliers_fraction,
        check_estimator=False,
        random_state=random_state,
    ),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction,
    ),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction,
    ),
    'Average KNN': KNN(
        method='mean',
        contamination=outliers_fraction,
    ),
    'Isolation Forest': IForest(
        contamination=outliers_fraction,
        random_state=random_state,
        behaviour='new',
    ),
}
# Shared evaluation mesh over the unit square, used to draw each model's
# decision surface.
xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

# Fit each detector, report its inlier/outlier split, and plot its
# decision surface with inliers (white) and outliers (black) overlaid.
for clf_name, clf in classifiers.items():
    clf.fit(X)
    # Raw anomaly score, negated so that LOWER values mean MORE anomalous
    # (matches the contour-shading convention below).
    scores_pred = clf.decision_function(X) * -1
    # Binary prediction per datapoint: 0 = inlier, 1 = outlier.
    y_pred = clf.predict(X)
    n_outliers = int(np.count_nonzero(y_pred == 1))
    n_inliers = len(y_pred) - n_outliers
    plt.figure(figsize=(10, 10))
    # NOTE: this is an alias, NOT a copy -- writing 'outlier' through dfx
    # mutates df itself, so after the loop df keeps the labels of the LAST
    # classifier run (intentional; see the note above the classifiers dict).
    dfx = df
    dfx['outlier'] = y_pred.tolist()
    # IX1/IX2 - inlier features, OX1/OX2 - outlier features, extracted the
    # same way for both groups.
    IX1 = dfx['X1'][dfx['outlier'] == 0].values.reshape(-1, 1)
    IX2 = dfx['Y1'][dfx['outlier'] == 0].values.reshape(-1, 1)
    OX1 = dfx['X1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    OX2 = dfx['Y1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers, clf_name)
    # Score value that separates the outliers_fraction most anomalous points
    # from the rest; used as the plotted decision boundary.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
    # Raw anomaly score for every mesh point (same negation as above).
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)
    # Blue fill: scores from the minimum anomaly score up to the threshold.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                 cmap=plt.cm.Blues_r)
    # Red contour line where the anomaly score equals the threshold.
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    # Orange fill: scores between the threshold and the maximum anomaly score.
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')
    c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')
    plt.axis('tight')
    # loc=2 places the legend in the top-left corner.
    plt.legend(
        [a.collections[0], b, c],
        ['learned decision function', 'inliers', 'outliers'],
        prop=matplotlib.font_manager.FontProperties(size=20),
        loc=2)
    plt.xlim((0, 1))
    plt.ylim((0, 1))
    plt.title(clf_name)
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment