@wbhinton · Created December 3, 2019
Outlier detection using PyOD. Before running the script, narrow your data down to a two-column DataFrame named df, with X and Y as the column names.
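A minimal sketch of that preparation step; the CSV path and source column names here are hypothetical placeholders, not part of the original gist:

import pandas as pd

# Hypothetical example: reduce a wider dataset to the two-column frame
# this gist expects. 'measurements.csv', 'width_mm', and 'height_mm'
# are placeholders for your own file and columns.
raw = pd.read_csv('measurements.csv')
df = raw[['width_mm', 'height_mm']].rename(columns={'width_mm': 'X', 'height_mm': 'Y'})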
import numpy as np
import matplotlib
import matplotlib.font_manager
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
# Import the PyOD detectors being compared
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
# Scale both features to [0, 1] so the detectors and the plot grid agree
df['X1'] = df['X']
df['Y1'] = df['Y']
scaler = MinMaxScaler(feature_range=(0, 1))
df[['X1', 'Y1']] = scaler.fit_transform(df[['X1', 'Y1']])
print(df[['X1', 'Y1']].head())

# Stack the two scaled features into an (n_samples, 2) array for PyOD
X1 = df['X1'].values.reshape(-1, 1)
X2 = df['Y1'].values.reshape(-1, 1)
X = np.concatenate((X1, X2), axis=1)

random_state = np.random.RandomState(34)
outliers_fraction = 0.15  # the expected share of observations that are not similar to the rest of the dataset

# Define the outlier detectors to compare; add or remove methods as needed.
# The last method run leaves an 'outlier' column (0 = inlier, 1 = outlier) on the dataframe.
classifiers = {
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state),
    'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False, random_state=random_state),
    'Histogram-based Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean', contamination=outliers_fraction),
    # behaviour='new' matches the 2019-era scikit-learn API; newer versions may reject it
    'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state, behaviour='new')
}
# 200 x 200 grid over the unit square, used to plot each model's decision surface
xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
    # raw anomaly score for every point (negated so higher means more normal)
    scores_pred = clf.decision_function(X) * -1
    # binary prediction for each datapoint: 0 = inlier, 1 = outlier
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    plt.figure(figsize=(10, 10))

    # dfx aliases df (it is not a copy), so the 'outlier' column written by
    # the last method in the loop persists on df afterwards
    dfx = df
    dfx['outlier'] = y_pred.tolist()

    # IX1 - inlier feature 1, IX2 - inlier feature 2
    IX1 = np.array(dfx['X1'][dfx['outlier'] == 0]).reshape(-1, 1)
    IX2 = np.array(dfx['Y1'][dfx['outlier'] == 0]).reshape(-1, 1)
    # OX1 - outlier feature 1, OX2 - outlier feature 2
    OX1 = dfx['X1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    OX2 = dfx['Y1'][dfx['outlier'] == 1].values.reshape(-1, 1)
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, clf_name)

    # score threshold separating inliers from outliers
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
    # evaluate the decision function over the plot grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    # fill with the blue colormap from the minimum anomaly score to the threshold
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
    # draw a red contour line where the anomaly score equals the threshold
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
    # fill orange where the anomaly score runs from the threshold to its maximum
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')
    c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')

    plt.axis('tight')
    plt.legend(
        [a.collections[0], b, c],  # note: ContourSet.collections is deprecated in newer matplotlib
        ['learned decision function', 'inliers', 'outliers'],
        prop=matplotlib.font_manager.FontProperties(size=20),
        loc=2)  # loc=2 places the legend in the top-left corner
    plt.xlim((0, 1))
    plt.ylim((0, 1))
    plt.title(clf_name)
    plt.show()
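Since dfx aliases df, the 'outlier' column from the final detector in the loop persists on df, and the flagged rows can be pulled out directly afterwards. A minimal sketch, assuming the loop above has run:

# Rows flagged by the last detector in the loop (Isolation Forest above)
flagged = df[df['outlier'] == 1]
print(flagged[['X', 'Y']])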