@bistaumanga
Last active December 25, 2015 01:48
local outlier factor
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 22:19:09 2014
@author: bistaumanga
Local Outlier Factor (LOF) implementation in Python.
The kNN step is implemented in two versions:
1. Naive method (full pairwise distance matrix)
2. Using BallTree/KDTree data structures
"""
import numpy as np
import matplotlib.pyplot as plt
import sys
def exit():
    sys.exit()
class LOF:

    def __init__(self, k = 3):
        self.k = k
    def generate_data(self, n = 500, dim = 3):
        n1, n2 = n // 3, n // 5
        n3 = n - n1 - n2
        # cluster of gaussian random data
        from sklearn.datasets import make_blobs
        data1, _ = make_blobs(n1, dim, centers = 3)
        # cluster of uniform random variable
        data2 = np.random.uniform(0, 25, size = (n2, dim))
        # cluster of dense uniform random variable
        data3 = np.random.uniform(100, 200, size = (n3, dim))
        # mix the three datasets
        self.data = np.vstack((data1, data2, data3))
        np.random.shuffle(self.data)
        # add some noise: zipf is a skewed distribution
        zipf_alpha = 2.5
        noise = np.random.zipf(zipf_alpha, (n, dim)) * \
                np.sign((np.random.randint(2, size = (n, dim)) - 0.5))
        self.data += noise
    def _knn_naive(self):
        # pairwise distances between all points
        import time
        tic = time.time()
        from sklearn.neighbors import DistanceMetric
        dist = DistanceMetric.get_metric('euclidean').pairwise(self.data)
        print('++ took %g msecs for Distance computation' % ((time.time() - tic) * 1000))
        tic = time.time()
        # get the radius (k-distance) for each point in the dataset:
        # the distance to its kth nearest neighbour
        self.idx_knn = np.argsort(dist, axis = 1)[:, 1 : self.k + 1] # k nearest neighbours per row, excluding self
        radius = np.linalg.norm(self.data - self.data[self.idx_knn[:, -1]], axis = 1) # k-distance
        print('+++ took %g msecs for KNN Querying' % ((time.time() - tic) * 1000))
        # mean reachability distance of each point to its k neighbours
        # (its inverse, taken in train(), is the local reachability density)
        LRD = []
        for i in range(self.idx_knn.shape[0]):
            LRD.append(np.mean(np.maximum(dist[i, self.idx_knn[i]], radius[self.idx_knn[i]])))
        return np.array(LRD)
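
    # Note: the naive version materialises the full n x n pairwise distance matrix,
    # so it costs O(n^2) time and memory; the tree-based version below queries a
    # BallTree/KDTree instead, which is usually far cheaper for low-dimensional data.
    # That difference is exactly what perf_test() at the bottom of this script compares.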
    def _knn_tree(self):
        import time
        tic = time.time()
        # insert the dataset into a BallTree/KDTree data structure
        if self.data.shape[1] >= 20:
            from sklearn.neighbors import BallTree as Tree
        else:
            from sklearn.neighbors import KDTree as Tree
        BT = Tree(self.data, leaf_size = 5, p = 2)
        # query for k + 1 nearest neighbours, because the nearest one is the point itself
        dx, self.idx_knn = BT.query(self.data[:, :], k = self.k + 1)
        #
        # from sklearn.neighbors import NearestNeighbors as NN
        # knn = NN(n_neighbors = self.k + 1, leaf_size = 5, p = 2)
        # knn.fit(self.data)
        # dx, self.idx_knn = knn.kneighbors(self.data, return_distance = True)
        print('++ took %g msecs for Tree KNN Querying' % ((time.time() - tic) * 1000))
        # drop the self-match in the first column
        dx, self.idx_knn = dx[:, 1:], self.idx_knn[:, 1:]
        # get the radius (k-distance) for each point in the dataset:
        # the distance to its kth nearest neighbour
        radius = dx[:, -1]
        # mean reachability distance of each point to its k neighbours
        # (its inverse, taken in train(), is the local reachability density)
        LRD = np.mean(np.maximum(dx, radius[self.idx_knn]), axis = 1)
        return LRD
    def train(self, data = None, method = 'Naive'):
        # check if a dataset is provided for training
        try:
            assert data is not None and data.shape[0]
            self.data = data
            n = self.data.shape[0] # number of data points
        except AssertionError:
            try:
                n = self.data.shape[0] # number of data points
            except AttributeError:
                print('No data to fit the model, please provide data or call the generate_data method')
                exit()
        try:
            assert method.lower() in ['naive', 'n', 'tree', 't']
        except AssertionError:
            print('Method must be Naive|n or Tree|t')
            exit()
        # rho is the local reachability density, the inverse of the mean reachability distance
        if method.lower() in ['naive', 'n']:
            rho = 1. / self._knn_naive()
        elif method.lower() in ['tree', 't']:
            rho = 1. / self._knn_tree()
        # LOF score: mean ratio of the neighbours' density to the point's own density
        self.score = np.sum(rho[self.idx_knn], axis = 1) / np.array(rho, dtype = np.float16)
        self.score *= 1. / self.k
    def plot(self, threshold = None):
        # set the threshold
        if not threshold:
            from scipy.stats.mstats import mquantiles
            threshold = max(mquantiles(self.score, prob = 0.95)[0], 2.)
        self.threshold = threshold
        # reduce the data to 2D if required
        if self.data.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components = 2)
            self.data = pca.fit_transform(self.data)
        # plot non-outliers in green
        plt.figure()
        plt.scatter(self.data[:, 0], self.data[:, 1], c = 'green', s = 10, edgecolors = 'None', alpha = 0.5)
        # find the outliers and plot them in red
        idx = np.where(self.score > self.threshold)
        plt.scatter(self.data[idx, 0], self.data[idx, 1], c = 'red', s = 10, edgecolors = 'None', alpha = 0.5)
        plt.legend(['Normal', 'Outliers'])
        # plot the distribution of the outlier score
        plt.figure()
        plt.hist(self.score, bins = 20)
        plt.title('Distribution of outlier score')
def test(methods = ['Tree', 'Naive'], n = 200):
    _temp = LOF()
    _temp.generate_data(n = n, dim = 3)
    data = _temp.data
    for i, m in enumerate(methods):
        lof = LOF(k = 7)
        lof.train(data = data, method = m)
        lof.plot()
def perf_test(n_list = None, methods = ['Tree', 'Naive'], plot = False):
    import time
    if not n_list: n_list = [2 ** i for i in range(7, 14)]
    print(methods)
    result = []
    result.append(n_list)
    for m in methods:
        temp = []
        for n in n_list:
            tic = time.time()
            lof = LOF(k = 3)
            lof.generate_data(n = n, dim = 2)
            lof.train(method = m)
            temp.append(1000000 * (time.time() - tic)) # micro-seconds, to match the plot's y-axis
            print('Took %g msecs with %s method for %d datapoints\n' %
                  ((time.time() - tic) * 1000, m, n))
        result.append(temp)
    if plot:
        fig, ax = plt.subplots()
        ax.set_xscale('log', base = 2)
        ax.set_yscale('log', base = 10)
        plt.plot(result[0], result[1], 'm*-', ms = 10, mec = None)
        try:
            plt.plot(result[0], result[2], 'co--', ms = 8, mec = None)
        except IndexError:
            pass
        plt.xlabel('Number of data points $n$')
        plt.ylabel(r'Time of execution ($\mu$ secs)')
        plt.legend(methods, loc = 'upper left')
        plt.show()
if __name__ == "__main__":
# test(['Tree', 'Naive'], n = 1000)
perf_test(methods = ['Tree', 'Naive'], n_list = [2 ** i for i in range(4, 14)], plot = True)
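
# Minimal usage sketch (assumes `X` is a (n_samples, n_features) numpy array;
# `X` is illustrative here and not defined in this gist):
#
#     lof = LOF(k = 10)
#     lof.train(data = X, method = 'tree')    # or method = 'naive'
#     outliers = np.where(lof.score > 2.)[0]  # indices of points with a high LOF score
#     lof.plot()                              # scatter plot plus score histogram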