jhumigas · August 21, 2018 21:48
diff --git a/trust_score.py b/trust_score.py
 """
 The MIT License (MIT)
 Copyright (c) 2018 David Mugisha

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """  

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 from sklearn.neighbors import NearestNeighbors
 import numpy as np
 import scipy as sp

 def high_density_set(X, k_knn, alpha, knn_algorithm='ball_tree', metric='euclidean'):
    """Estimate the high density set of X by dropping its lowest-density samples [1].

    The density around a sample is proxied by its k-NN radius, i.e. the
    distance to its ``k_knn``-th nearest neighbor within ``X``. Roughly an
    ``alpha`` fraction of the samples, those with the largest radius (lowest
    density, possibly outliers), is filtered out.
    The parameter alpha should be determined using cross-validation on a
    training set.

    Args:
        X(np.ndarray): (n_samples, n_features) array holding all the samples
        k_knn(int): number of nearest neighbors used to compute the k-NN
            radius; clamped to n_samples when X holds fewer samples
        alpha(float): density threshold in [0, 1], i.e. fraction of the
            samples to filter out (0 keeps every sample)
        knn_algorithm(str): forwarded to NearestNeighbors as ``algorithm``
        metric(str): forwarded to NearestNeighbors as ``metric``

    Returns:
        tuple: (
            np.ndarray: Samples kept
            np.ndarray: Array of boolean used to select the samples
        )

    Raises:
        ValueError: if alpha is outside [0, 1]

    References:
        [1] Jiang, H., Kim, B., & Gupta, M. (2018).
        To Trust Or Not To Trust A Classifier.
        arXiv preprint arXiv:1805.11783.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be in [0, 1], got {!r}".format(alpha))

    X = np.asarray(X)
    n_samples = X.shape[0]
    if n_samples == 0:
        # Nothing to filter; NearestNeighbors cannot be fitted on no samples.
        return X, np.zeros(0, dtype=bool)

    # NearestNeighbors refuses n_neighbors > n_samples, which easily happens
    # when this is called on a small per-class subset.
    n_neighbors = min(k_knn, n_samples)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=knn_algorithm, metric=metric).fit(X)
    # NOTE: the query set equals the fitted set, so (per scikit-learn's
    # kneighbors semantics) each sample is expected to show up as its own
    # nearest neighbor at distance 0.
    distances, _ = nbrs.kneighbors(X)
    knn_radius = np.amax(distances, axis=1)

    # Radius below which a (1 - alpha) fraction of the samples fall. The
    # previous code indexed into a merely *reversed* (not sorted) copy of the
    # radii, which made the threshold depend on the order of the samples.
    filtering_radius = np.percentile(knn_radius, (1 - alpha) * 100)
    selected = knn_radius <= filtering_radius
    return X[selected], selected

 def trust_score(x_test, y_test, X, Y, alpha, k_knn, metric='euclidean'):
    """Measure of the agreement between the classifier and a modified nearest
    neighborhood classifier on a test example x_test [1].

    The score is the distance from x_test to the closest high density set of
    any class other than y_test, divided by the distance from x_test to the
    high density set of class y_test.

    Args:
        x_test: (1, n_features) array of the sample to test; a 1-D array of
            n_features values is also accepted
        y_test: predicted label of the testing sample
        X: (n_samples, n_features) array of training samples, should not contain x_test
        Y: (n_samples,) array of predicted labels on training set
        alpha: density threshold
        k_knn: k for nearest neighbors used to estimate the high density sets
        metric: distance metric used for knn and for the distances to x_test

    Returns:
        float: Trust score. It is ``inf`` when Y holds no label other than
        y_test. If x_test coincides with a kept sample of class y_test the
        denominator is 0 and the result follows numpy's division rules.

    Raises:
        ValueError: if no sample of X is labelled y_test

    References:
        [1] Jiang, H., Kim, B., & Gupta, M. (2018).
        To Trust Or Not To Trust A Classifier.
        arXiv preprint arXiv:1805.11783.
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.spatial`` subpackage has been loaded.
    from scipy.spatial.distance import cdist

    x_test = np.atleast_2d(x_test)
    X = np.asarray(X)
    Y = np.asarray(Y)

    if not np.any(Y == y_test):
        raise ValueError(
            "no training sample is labelled {!r}; cannot compute a trust score".format(y_test))

    def distance_to_class(label):
        # Filter the class down to its high density set, then take the
        # distance from the test sample to the closest remaining point.
        dense_members = high_density_set(X[Y == label], k_knn, alpha, metric=metric)[0]
        return cdist(x_test, dense_members, metric=metric).min()

    # Distance to the closest class that is NOT the predicted one.
    d_num = np.inf
    for y_train in np.unique(Y):
        if y_train != y_test:
            d_num = min(d_num, distance_to_class(y_train))

    d_denom = distance_to_class(y_test)
    return d_num / d_denom
	"""
	The MIT License (MIT)
	Copyright (c) 2018 David Mugisha

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:
	The above copyright notice and this permission notice shall be included in all
	copies or substantial portions of the Software.
	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	SOFTWARE.
	"""

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	from sklearn.neighbors import NearestNeighbors
	import numpy as np
	import scipy as sp

	def high_density_set(X, k_knn, alpha, knn_algorithm='ball_tree', metric='euclidean'):
	    """Estimate the high density set of X by dropping its lowest-density samples [1].

	    The density around a sample is proxied by its k-NN radius, i.e. the
	    distance to its ``k_knn``-th nearest neighbor within ``X``. Roughly an
	    ``alpha`` fraction of the samples, those with the largest radius (lowest
	    density, possibly outliers), is filtered out.
	    The parameter alpha should be determined using cross-validation on a
	    training set.

	    Args:
	        X(np.ndarray): (n_samples, n_features) array holding all the samples
	        k_knn(int): number of nearest neighbors used to compute the k-NN
	            radius; clamped to n_samples when X holds fewer samples
	        alpha(float): density threshold in [0, 1], i.e. fraction of the
	            samples to filter out (0 keeps every sample)
	        knn_algorithm(str): forwarded to NearestNeighbors as ``algorithm``
	        metric(str): forwarded to NearestNeighbors as ``metric``

	    Returns:
	        tuple: (
	            np.ndarray: Samples kept
	            np.ndarray: Array of boolean used to select the samples
	        )

	    Raises:
	        ValueError: if alpha is outside [0, 1]

	    References:
	        [1] Jiang, H., Kim, B., & Gupta, M. (2018).
	        To Trust Or Not To Trust A Classifier.
	        arXiv preprint arXiv:1805.11783.
	    """
	    if not 0 <= alpha <= 1:
	        raise ValueError("alpha must be in [0, 1], got {!r}".format(alpha))

	    X = np.asarray(X)
	    n_samples = X.shape[0]
	    if n_samples == 0:
	        # Nothing to filter; NearestNeighbors cannot be fitted on no samples.
	        return X, np.zeros(0, dtype=bool)

	    # NearestNeighbors refuses n_neighbors > n_samples, which easily happens
	    # when this is called on a small per-class subset.
	    n_neighbors = min(k_knn, n_samples)
	    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=knn_algorithm, metric=metric).fit(X)
	    # NOTE: the query set equals the fitted set, so (per scikit-learn's
	    # kneighbors semantics) each sample is expected to show up as its own
	    # nearest neighbor at distance 0.
	    distances, _ = nbrs.kneighbors(X)
	    knn_radius = np.amax(distances, axis=1)

	    # Radius below which a (1 - alpha) fraction of the samples fall. The
	    # previous code indexed into a merely *reversed* (not sorted) copy of the
	    # radii, which made the threshold depend on the order of the samples.
	    filtering_radius = np.percentile(knn_radius, (1 - alpha) * 100)
	    selected = knn_radius <= filtering_radius
	    return X[selected], selected

	def trust_score(x_test, y_test, X, Y, alpha, k_knn, metric='euclidean'):
	    """Measure of the agreement between the classifier and a modified nearest
	    neighborhood classifier on a test example x_test [1].

	    The score is the distance from x_test to the closest high density set of
	    any class other than y_test, divided by the distance from x_test to the
	    high density set of class y_test.

	    Args:
	        x_test: (1, n_features) array of the sample to test; a 1-D array of
	            n_features values is also accepted
	        y_test: predicted label of the testing sample
	        X: (n_samples, n_features) array of training samples, should not contain x_test
	        Y: (n_samples,) array of predicted labels on training set
	        alpha: density threshold
	        k_knn: k for nearest neighbors used to estimate the high density sets
	        metric: distance metric used for knn and for the distances to x_test

	    Returns:
	        float: Trust score. It is ``inf`` when Y holds no label other than
	        y_test. If x_test coincides with a kept sample of class y_test the
	        denominator is 0 and the result follows numpy's division rules.

	    Raises:
	        ValueError: if no sample of X is labelled y_test

	    References:
	        [1] Jiang, H., Kim, B., & Gupta, M. (2018).
	        To Trust Or Not To Trust A Classifier.
	        arXiv preprint arXiv:1805.11783.
	    """
	    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
	    # ``scipy.spatial`` subpackage has been loaded.
	    from scipy.spatial.distance import cdist

	    x_test = np.atleast_2d(x_test)
	    X = np.asarray(X)
	    Y = np.asarray(Y)

	    if not np.any(Y == y_test):
	        raise ValueError(
	            "no training sample is labelled {!r}; cannot compute a trust score".format(y_test))

	    def distance_to_class(label):
	        # Filter the class down to its high density set, then take the
	        # distance from the test sample to the closest remaining point.
	        dense_members = high_density_set(X[Y == label], k_knn, alpha, metric=metric)[0]
	        return cdist(x_test, dense_members, metric=metric).min()

	    # Distance to the closest class that is NOT the predicted one.
	    d_num = np.inf
	    for y_train in np.unique(Y):
	        if y_train != y_test:
	            d_num = min(d_num, distance_to_class(y_train))

	    d_denom = distance_to_class(y_test)
	    return d_num / d_denom