Local outlier factor
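A quick usage sketch (assuming this file is saved as lof.py; k = 5 here is an illustrative choice):

    from lof import LOF
    import matplotlib.pyplot as plt

    model = LOF(k = 5)
    model.generate_data(n = 500, dim = 3)
    model.train(method = 'tree')
    model.plot()
    plt.show()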
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 22:19:09 2014

@author: bistaumanga

Local Outlier Factor implementation in Python.

Two implementations are provided for the kNN step:
1. Naive method (full pairwise distance matrix)
2. BallTree/KDTree data structures
"""
import numpy as np
import matplotlib.pyplot as plt
import sys

class LOF:

    def __init__(self, k = 3):
        self.k = k

    def generate_data(self, n = 500, dim = 3):
        n1, n2 = n // 3, n // 5
        n3 = n - n1 - n2
        # clusters of gaussian random data
        from sklearn.datasets import make_blobs
        data1, _ = make_blobs(n1, dim, centers = 3)
        # cluster of uniform random variables
        data2 = np.random.uniform(0, 25, size = (n2, dim))
        # denser cluster of uniform random variables, further away
        data3 = np.random.uniform(100, 200, size = (n3, dim))
        # mix the three datasets
        self.data = np.vstack((data1, data2, data3))
        np.random.shuffle(self.data)
        # add some noise: the zipf distribution is heavily skewed
        zipf_alpha = 2.5
        noise = np.random.zipf(zipf_alpha, (n, dim)) * \
                np.sign(np.random.randint(2, size = (n, dim)) - 0.5)
        self.data += noise

    def _knn_naive(self):
        import time
        tic = time.time()
        # full pairwise distance matrix: O(n^2) in time and memory
        # (DistanceMetric moved from sklearn.neighbors to sklearn.metrics
        # in newer scikit-learn versions)
        from sklearn.metrics import DistanceMetric
        dist = DistanceMetric.get_metric('euclidean').pairwise(self.data)
        print('++ took %g msecs for distance computation' % ((time.time() - tic) * 1000))
        tic = time.time()
        # radius is the distance to the kth nearest neighbour of each point;
        # column 0 of the argsort is the point itself, so skip it
        self.idx_knn = np.argsort(dist, axis = 1)[:, 1 : self.k + 1]
        radius = np.linalg.norm(self.data - self.data[self.idx_knn[:, -1]], axis = 1)
        print('+++ took %g msecs for kNN querying' % ((time.time() - tic) * 1000))
        # mean reachability distance: max(d(p, o), k-distance(o)) averaged
        # over the k neighbours o of each point p
        LRD = []
        for i in range(self.idx_knn.shape[0]):
            LRD.append(np.mean(np.maximum(dist[i, self.idx_knn[i]], radius[self.idx_knn[i]])))
        return np.array(LRD)

    def _knn_tree(self):
        import time
        tic = time.time()
        # KD-trees degrade in high dimensions, so fall back to a ball tree
        if self.data.shape[1] >= 20:
            from sklearn.neighbors import BallTree as Tree
        else:
            from sklearn.neighbors import KDTree as Tree
        BT = Tree(self.data, leaf_size = 5, p = 2)
        # query k + 1 neighbours because the nearest one is the point itself
        dx, self.idx_knn = BT.query(self.data, k = self.k + 1)
        # equivalently, via the NearestNeighbors wrapper:
        # from sklearn.neighbors import NearestNeighbors as NN
        # knn = NN(n_neighbors = self.k + 1, leaf_size = 5, p = 2)
        # knn.fit(self.data)
        # dx, self.idx_knn = knn.kneighbors(self.data, return_distance = True)
        print('++ took %g msecs for tree kNN querying' % ((time.time() - tic) * 1000))
        # drop the self-match in column 0
        dx, self.idx_knn = dx[:, 1:], self.idx_knn[:, 1:]
        # radius is the distance to the kth nearest neighbour of each point
        radius = dx[:, -1]
        # mean reachability distance over the k neighbours
        LRD = np.mean(np.maximum(dx, radius[self.idx_knn]), axis = 1)
        return LRD
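
    # Worked micro-example (k = 2, numbers are illustrative): if a point's two
    # neighbours lie at distances 1.0 and 4.0, and their own k-distances are
    # 3.0 and 2.0, the reachability distances are max(1.0, 3.0) = 3.0 and
    # max(4.0, 2.0) = 4.0, giving a mean reachability distance of 3.5 and an
    # lrd of 1 / 3.5.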

    def train(self, data = None, method = 'Naive'):
        # use the provided dataset if any, else whatever generate_data made
        if data is not None and data.shape[0]:
            self.data = data
        if not hasattr(self, 'data'):
            print('No data to fit the model; provide data or call generate_data first')
            sys.exit()
        if method.lower() not in ['naive', 'n', 'tree', 't']:
            print('Method must be Naive|n or Tree|t')
            sys.exit()
        # rho is the local reachability density, i.e. the reciprocal of the
        # mean reachability distance returned by the _knn_* methods
        if method.lower() in ['naive', 'n']:
            rho = 1. / self._knn_naive()
        else:
            rho = 1. / self._knn_tree()
        # LOF score: mean lrd of the k neighbours over the point's own lrd
        self.score = np.sum(rho[self.idx_knn], axis = 1) / rho
        self.score *= 1. / self.k
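
    # Standard LOF semantics: scores near 1 mean a point is about as dense as
    # its neighbourhood, while scores well above 1 flag outliers. The
    # 95th-percentile / 2.0 threshold used in plot() is a heuristic of this
    # gist, not part of the original algorithm.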

    def plot(self, threshold = None):
        # set the threshold: 95th percentile of the scores, but at least 2
        if not threshold:
            from scipy.stats.mstats import mquantiles
            threshold = max(mquantiles(self.score, prob = 0.95)[0], 2.)
        self.threshold = threshold
        # reduce the data to 2D for plotting if required (note that this
        # overwrites self.data with its PCA projection)
        if self.data.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components = 2)
            self.data = pca.fit_transform(self.data)
        # plot the non-outliers in green
        plt.figure()
        plt.scatter(self.data[:, 0], self.data[:, 1], c = 'green', s = 10, edgecolors = 'none', alpha = 0.5)
        # find and plot the outliers in red
        idx = np.where(self.score > self.threshold)
        plt.scatter(self.data[idx, 0], self.data[idx, 1], c = 'red', s = 10, edgecolors = 'none', alpha = 0.5)
        plt.legend(['Normal', 'Outliers'])
        # plot the distribution of the outlier score
        plt.figure()
        plt.hist(self.score, bins = 20)
        plt.title('Distribution of outlier score')


def test(methods = ['Tree', 'Naive'], n = 200):
    _temp = LOF()
    _temp.generate_data(n = n, dim = 3)
    data = _temp.data
    for m in methods:
        lof = LOF(k = 7)
        lof.train(data = data, method = m)
        lof.plot()
    plt.show()


def perf_test(n_list = None, methods = ['Tree', 'Naive'], plot = False):
    import time
    if not n_list:
        n_list = [2 ** i for i in range(7, 14)]
    print(methods)
    result = [n_list]
    for m in methods:
        temp = []
        for n in n_list:
            tic = time.time()
            lof = LOF(k = 3)
            lof.generate_data(n = n, dim = 2)
            lof.train(method = m)
            temp.append(1000000 * (time.time() - tic)) # microseconds
            print('Took %g msecs with %s method for %d datapoints\n' %
                  ((time.time() - tic) * 1000, m, n))
        result.append(temp)
    if plot:
        fig, ax = plt.subplots()
        ax.set_xscale('log', base = 2)
        ax.set_yscale('log', base = 10)
        plt.plot(result[0], result[1], 'm*-', ms = 10, mec = 'none')
        try:
            plt.plot(result[0], result[2], 'co--', ms = 8, mec = 'none')
        except IndexError:
            pass
        plt.xlabel('Number of data points $n$')
        plt.ylabel(r'Time of execution ($\mu$secs)')
        plt.legend(methods, loc = 'upper left')
        plt.show()


if __name__ == "__main__":
    # test(['Tree', 'Naive'], n = 1000)
    perf_test(methods = ['Tree', 'Naive'], n_list = [2 ** i for i in range(4, 14)], plot = True)
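
# For cross-checking, modern scikit-learn (>= 0.19) ships its own
# implementation; a minimal sketch (n_neighbors = 20 is an illustrative
# choice, not taken from this gist):
#
#     from sklearn.neighbors import LocalOutlierFactor
#     clf = LocalOutlierFactor(n_neighbors = 20)
#     labels = clf.fit_predict(X)              # -1 = outlier, 1 = inlier
#     scores = -clf.negative_outlier_factor_   # larger => more outlying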