Local outlier factor
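A quick usage sketch (assuming this file is saved as lof.py; k = 5 here is an illustrative choice):

    from lof import LOF
    import matplotlib.pyplot as plt

    model = LOF(k = 5)
    model.generate_data(n = 500, dim = 3)
    model.train(method = 'tree')
    model.plot()
    plt.show()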
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 22:19:09 2014

@author: bistaumanga

Local Outlier Factor implementation in Python.

Two implementations are provided for the kNN step:
1. Naive method (full pairwise distance matrix)
2. BallTree/KDTree data structures
"""
import numpy as np
import matplotlib.pyplot as plt
import sys

class LOF:

    def __init__(self, k = 3):
        self.k = k

    def generate_data(self, n = 500, dim = 3):
        n1, n2 = n // 3, n // 5
        n3 = n - n1 - n2
        # clusters of gaussian random data
        from sklearn.datasets import make_blobs
        data1, _ = make_blobs(n1, dim, centers = 3)
        # cluster of uniform random variables
        data2 = np.random.uniform(0, 25, size = (n2, dim))
        # denser cluster of uniform random variables, further away
        data3 = np.random.uniform(100, 200, size = (n3, dim))
        # mix the three datasets
        self.data = np.vstack((data1, data2, data3))
        np.random.shuffle(self.data)
        # add some noise: the zipf distribution is heavily skewed
        zipf_alpha = 2.5
        noise = np.random.zipf(zipf_alpha, (n, dim)) * \
                np.sign(np.random.randint(2, size = (n, dim)) - 0.5)
        self.data += noise

    def _knn_naive(self):
        import time
        tic = time.time()
        # full pairwise distance matrix: O(n^2) in time and memory
        # (DistanceMetric moved from sklearn.neighbors to sklearn.metrics
        # in newer scikit-learn versions)
        from sklearn.metrics import DistanceMetric
        dist = DistanceMetric.get_metric('euclidean').pairwise(self.data)
        print('++ took %g msecs for distance computation' % ((time.time() - tic) * 1000))
        tic = time.time()
        # radius is the distance to the kth nearest neighbour of each point;
        # column 0 of the argsort is the point itself, so skip it
        self.idx_knn = np.argsort(dist, axis = 1)[:, 1 : self.k + 1]
        radius = np.linalg.norm(self.data - self.data[self.idx_knn[:, -1]], axis = 1)
        print('+++ took %g msecs for kNN querying' % ((time.time() - tic) * 1000))
        # mean reachability distance: max(d(p, o), k-distance(o)) averaged
        # over the k neighbours o of each point p
        LRD = []
        for i in range(self.idx_knn.shape[0]):
            LRD.append(np.mean(np.maximum(dist[i, self.idx_knn[i]], radius[self.idx_knn[i]])))
        return np.array(LRD)

    def _knn_tree(self):
        import time
        tic = time.time()
        # KD-trees degrade in high dimensions, so fall back to a ball tree
        if self.data.shape[1] >= 20:
            from sklearn.neighbors import BallTree as Tree
        else:
            from sklearn.neighbors import KDTree as Tree
        BT = Tree(self.data, leaf_size = 5, p = 2)
        # query k + 1 neighbours because the nearest one is the point itself
        dx, self.idx_knn = BT.query(self.data, k = self.k + 1)
        # equivalently, via the NearestNeighbors wrapper:
        # from sklearn.neighbors import NearestNeighbors as NN
        # knn = NN(n_neighbors = self.k + 1, leaf_size = 5, p = 2)
        # knn.fit(self.data)
        # dx, self.idx_knn = knn.kneighbors(self.data, return_distance = True)
        print('++ took %g msecs for tree kNN querying' % ((time.time() - tic) * 1000))
        # drop the self-match in column 0
        dx, self.idx_knn = dx[:, 1:], self.idx_knn[:, 1:]
        # radius is the distance to the kth nearest neighbour of each point
        radius = dx[:, -1]
        # mean reachability distance over the k neighbours
        LRD = np.mean(np.maximum(dx, radius[self.idx_knn]), axis = 1)
        return LRD
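
    # Worked micro-example (k = 2, numbers are illustrative): if a point's two
    # neighbours lie at distances 1.0 and 4.0, and their own k-distances are
    # 3.0 and 2.0, the reachability distances are max(1.0, 3.0) = 3.0 and
    # max(4.0, 2.0) = 4.0, giving a mean reachability distance of 3.5 and an
    # lrd of 1 / 3.5.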

    def train(self, data = None, method = 'Naive'):
        # use the provided dataset if any, else whatever generate_data made
        if data is not None and data.shape[0]:
            self.data = data
        if not hasattr(self, 'data'):
            print('No data to fit the model; provide data or call generate_data first')
            sys.exit()
        if method.lower() not in ['naive', 'n', 'tree', 't']:
            print('Method must be Naive|n or Tree|t')
            sys.exit()
        # rho is the local reachability density, i.e. the reciprocal of the
        # mean reachability distance returned by the _knn_* methods
        if method.lower() in ['naive', 'n']:
            rho = 1. / self._knn_naive()
        else:
            rho = 1. / self._knn_tree()
        # LOF score: mean lrd of the k neighbours over the point's own lrd
        self.score = np.sum(rho[self.idx_knn], axis = 1) / rho
        self.score *= 1. / self.k
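
    # Standard LOF semantics: scores near 1 mean a point is about as dense as
    # its neighbourhood, while scores well above 1 flag outliers. The
    # 95th-percentile / 2.0 threshold used in plot() is a heuristic of this
    # gist, not part of the original algorithm.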

    def plot(self, threshold = None):
        # set the threshold: 95th percentile of the scores, but at least 2
        if not threshold:
            from scipy.stats.mstats import mquantiles
            threshold = max(mquantiles(self.score, prob = 0.95)[0], 2.)
        self.threshold = threshold
        # reduce the data to 2D for plotting if required (note that this
        # overwrites self.data with its PCA projection)
        if self.data.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components = 2)
            self.data = pca.fit_transform(self.data)
        # plot the non-outliers in green
        plt.figure()
        plt.scatter(self.data[:, 0], self.data[:, 1], c = 'green', s = 10, edgecolors = 'none', alpha = 0.5)
        # find and plot the outliers in red
        idx = np.where(self.score > self.threshold)
        plt.scatter(self.data[idx, 0], self.data[idx, 1], c = 'red', s = 10, edgecolors = 'none', alpha = 0.5)
        plt.legend(['Normal', 'Outliers'])
        # plot the distribution of the outlier score
        plt.figure()
        plt.hist(self.score, bins = 20)
        plt.title('Distribution of outlier score')


def test(methods = ['Tree', 'Naive'], n = 200):
    _temp = LOF()
    _temp.generate_data(n = n, dim = 3)
    data = _temp.data
    for m in methods:
        lof = LOF(k = 7)
        lof.train(data = data, method = m)
        lof.plot()
    plt.show()


def perf_test(n_list = None, methods = ['Tree', 'Naive'], plot = False):
    import time
    if not n_list:
        n_list = [2 ** i for i in range(7, 14)]
    print(methods)
    result = [n_list]
    for m in methods:
        temp = []
        for n in n_list:
            tic = time.time()
            lof = LOF(k = 3)
            lof.generate_data(n = n, dim = 2)
            lof.train(method = m)
            temp.append(1000000 * (time.time() - tic)) # microseconds
            print('Took %g msecs with %s method for %d datapoints\n' %
                  ((time.time() - tic) * 1000, m, n))
        result.append(temp)
    if plot:
        fig, ax = plt.subplots()
        ax.set_xscale('log', base = 2)
        ax.set_yscale('log', base = 10)
        plt.plot(result[0], result[1], 'm*-', ms = 10, mec = 'none')
        try:
            plt.plot(result[0], result[2], 'co--', ms = 8, mec = 'none')
        except IndexError:
            pass
        plt.xlabel('Number of data points $n$')
        plt.ylabel(r'Time of execution ($\mu$secs)')
        plt.legend(methods, loc = 'upper left')
        plt.show()


if __name__ == "__main__":
    # test(['Tree', 'Naive'], n = 1000)
    perf_test(methods = ['Tree', 'Naive'], n_list = [2 ** i for i in range(4, 14)], plot = True)
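
# For cross-checking, modern scikit-learn (>= 0.19) ships its own
# implementation; a minimal sketch (n_neighbors = 20 is an illustrative
# choice, not taken from this gist):
#
#     from sklearn.neighbors import LocalOutlierFactor
#     clf = LocalOutlierFactor(n_neighbors = 20)
#     labels = clf.fit_predict(X)              # -1 = outlier, 1 = inlier
#     scores = -clf.negative_outlier_factor_   # larger => more outlying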