Created
December 3, 2019 16:48
-
-
Save wbhinton/fbed217826b75499bb1a65000f0340ef to your computer and use it in GitHub Desktop.
Outlier detection using PyOD. Before running, narrow your data down to a two-column DataFrame named `df` with X and Y as the column names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Import models
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
# Work on duplicated columns so the raw X/Y values survive the scaling step.
df['X1'] = df['X']
df['Y1'] = df['Y']

# Rescale both copies into the unit square [0, 1] x [0, 1] so every
# detector sees features on the same footing.
minmax = MinMaxScaler(feature_range=(0, 1))
df[['X1', 'Y1']] = minmax.fit_transform(df[['X1', 'Y1']])
df[['X1', 'Y1']].head()

# Stack the two scaled columns into an (n_samples, 2) feature matrix.
X = np.concatenate(
    (df['X1'].values.reshape(-1, 1), df['Y1'].values.reshape(-1, 1)),
    axis=1,
)

# Fixed seed so the stochastic detectors are reproducible run to run.
random_state = np.random.RandomState(34)
# The percentage of observations that are not similar to the rest of the dataset.
outliers_fraction = 0.15
# Outlier detection tools to be compared; add or remove methods as needed.
# Every detector is configured with the same contamination rate so their
# outlier counts are directly comparable.
# NOTE: whichever detector runs last leaves its 0/1 'outlier' column on df.
classifiers = {
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(
        contamination=outliers_fraction,
        check_estimator=False,
        random_state=random_state,
    ),
    'Feature Bagging': FeatureBagging(
        LOF(n_neighbors=35),
        contamination=outliers_fraction,
        check_estimator=False,
        random_state=random_state,
    ),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction,
    ),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction,
    ),
    'Average KNN': KNN(
        method='mean',
        contamination=outliers_fraction,
    ),
    'Isolation Forest': IForest(
        contamination=outliers_fraction,
        random_state=random_state,
        behaviour='new',
    ),
}
# Shared evaluation mesh over the unit square, used to draw each model's
# decision surface.
xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

# Fit each detector, report its inlier/outlier split, and plot its
# decision surface with inliers (white) and outliers (black) overlaid.
for clf_name, clf in classifiers.items():
    clf.fit(X)
    # Raw anomaly score, negated so that LOWER values mean MORE anomalous
    # (matches the contour-shading convention below).
    scores_pred = clf.decision_function(X) * -1
    # Binary prediction per datapoint: 0 = inlier, 1 = outlier.
    y_pred = clf.predict(X)
    n_outliers = int(np.count_nonzero(y_pred == 1))
    n_inliers = len(y_pred) - n_outliers
    plt.figure(figsize=(10, 10))
    # NOTE: this is an alias, NOT a copy -- writing 'outlier' through dfx
    # mutates df itself, so after the loop df keeps the labels of the LAST
    # classifier run (intentional; see the note above the classifiers dict).
    dfx = df
    dfx['outlier'] = y_pred.tolist()
    # IX1/IX2 - inlier features, OX1/OX2 - outlier features, extracted the
    # same way for both groups.
    IX1 = dfx['X1'][dfx['outlier'] == 0].values.reshape(-1, 1)
    IX2 = dfx['Y1'][dfx['outlier'] == 0].values.reshape(-1, 1)
    OX1 = dfx['X1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    OX2 = dfx['Y1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers, clf_name)
    # Score value that separates the outliers_fraction most anomalous points
    # from the rest; used as the plotted decision boundary.
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
    # Raw anomaly score for every mesh point (same negation as above).
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)
    # Blue fill: scores from the minimum anomaly score up to the threshold.
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                 cmap=plt.cm.Blues_r)
    # Red contour line where the anomaly score equals the threshold.
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    # Orange fill: scores between the threshold and the maximum anomaly score.
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')
    c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')
    plt.axis('tight')
    # loc=2 places the legend in the top-left corner.
    plt.legend(
        [a.collections[0], b, c],
        ['learned decision function', 'inliers', 'outliers'],
        prop=matplotlib.font_manager.FontProperties(size=20),
        loc=2)
    plt.xlim((0, 1))
    plt.ylim((0, 1))
    plt.title(clf_name)
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment