#!/usr/bin/env python
""" nearsame.py: an easy and intuitive way to find outliers in data 2d to 20d.
-> e.g.
    [50 24 15 10] % of the 38356 points have
    [ 3  2  1  0] of the 3 nearest neighbors in the same class.
Here one could keep the 74 % for which the nearest 3 neighbors
are mostly in the same class, and call the rest outliers.
Intuitively, points in mixed neighborhoods, where classes overlap,
are harder to classify. (For "classes" think colors,
e.g. a Go board: black, white or empty.)

A possible flow, using kd-trees (sketched in trim_and_classify below):
1) build a tree from the reference or training data
2) trim outliers with `nearsame`, e.g.
    keep = (ndiff <= 1)
    X = X[keep]
    y = y[keep]
3) build a new trimmed tree, and use it for classifying new data points
   by majority vote of say 3 or 5 nearest neighbors.
   If there's no clear majority, return "not sure".
(We have to build a new tree because most kd-tree implementations
cannot delete points easily.)

How coordinates are scaled -- the metric for "nearby" --
is of course important. Try normalizing all rows of X to 1,
i.e. the cos metric; then |x - y|^2 = 2 - 2 x . y
(see normalize_rows below).

See also:
Hastie et al., Elements of Stat Learning p. 118: 11 vowels, 90 samples each
    heed hid head had hard hud hod hoard hood who'd heard

keywords: multiclass high-dimensional outlier kd-tree
"""
# https://www.google.com/search?as_q=high-dimensional+outlier
# http://scholar.google.de/scholar?as_q=high-dimensional+outlier&as_occt=title&as_ylo=2010

from __future__ import division
import numpy as np
from scipy.spatial import cKDTree as KDTree
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html

__version__ = "2014-01-21 Jan denis"
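
#...............................................................................
    # A minimal sketch of the cos metric suggested in the docstring:
    # scale each row of X to unit length, so that squared Euclidean distance
    # becomes |x - y|^2 = 2 - 2 x.y .
    # normalize_rows is an illustration only, not used by nearsame itself.

def normalize_rows( X ):
    """ scale the rows of X to unit length => |x - y|^2 = 2 - 2 x.y """
    X = np.asarray( X, dtype=float )
    norms = np.sqrt( (X ** 2).sum(axis=1) )
    norms[norms == 0] = 1  # leave all-zero rows unchanged
    return X / norms[:, np.newaxis]
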
#...............................................................................
def nearsame( X, y, nnear=3, leafsize=10, p=2, verbose=1 ):
    """ in: X: ndata x ndim reals, ndim 2 .. 20
            y: class numbers, 0 1 2 ... for e.g. red green blue ...
        out: ndiff, ix
            ndiff: ndata int8s, the number of neighbors in a different class
                0 all same .. nnear all diff
            ix: ndata x nnear+1 indices 0 .. ndata, from kdtree
    """
    X = np.asanyarray( X )
    y = np.asanyarray( y )
    ndata, ndim = X.shape
    assert y.dtype.kind == "i", "y must be int, not %s" % y.dtype
    if ndim > 20:
        print "warning: nearsame %s may be slow, try KDTree cutoff" % str(X.shape)
            # flann handles 128d
    tree = KDTree( X, leafsize=leafsize )  # build the tree
    distances, ix = tree.query( X, k=nnear+1, p=p )  # nnear+1: each point finds itself too
    del distances  # only the neighbor indices are used here
    assert np.all( ix[:,0] == np.arange( ndata ))  # each point is its own nearest
        # neighbor; this can fail if X has exact duplicate points
    nearclasses = y[ix[:,1:]]  # classes of the nnear nearest neighbors
    ndiff = (y != nearclasses.T) .sum(axis=0) .astype(np.int8)  # 0 .. nnear
    if verbose:
        counts = np.bincount(ndiff)
        percents = (counts * 100 / counts.sum()) .round().astype(int)
        print """
%s %% of the %s points have
%s of the %d nearest neighbors in the same class. """ % (
            percents, X.shape, np.arange( nnear, -1, -1 ), nnear )
        print "# av ndiff of each class, high => harder to classify:"  # very roughly
        for j in range( y.max() + 1 ):
            ndiffj = ndiff[ y == j ]
            if len(ndiffj) > 0:
                print "%d %.2g " % (j, ndiffj.mean())
        print "\n"
    return ndiff, ix
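
#...............................................................................
    # A minimal sketch of the docstring's trim-and-classify flow, not part of
    # nearsame proper: trim with ndiff, rebuild the tree, then classify new
    # points Xnew by majority vote of nnear neighbors, -1 for "not sure".
    # The name trim_and_classify and the maxdiff cutoff are illustrative only.

def trim_and_classify( X, y, Xnew, nnear=3, maxdiff=1 ):
    """ in: X, y: arrays as in nearsame; Xnew: points to classify
        -> class numbers for the rows of Xnew, -1 if no clear majority
    """
    ndiff, ix = nearsame( X, y, nnear=nnear, verbose=0 )
    keep = (ndiff <= maxdiff)  # 2) trim points in mixed neighborhoods
    Xtrim, ytrim = X[keep], y[keep]
    tree = KDTree( Xtrim )  # 3) rebuild: most kd-trees cannot delete points
    distances, nearix = tree.query( Xnew, k=nnear )
    yhat = np.empty( len(Xnew), dtype=int )
    for j, classes in enumerate( ytrim[nearix] ):  # classes of nnear nearest neighbors
        votes = np.bincount( classes )
        yhat[j] = votes.argmax() if votes.max() > nnear // 2 else -1  # -1: not sure
    return yhat
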
def subset_arrays( ndiff, adict, near=1 ):
    """ X = X[ndiff <= near] for each array X in adict / np.load """
    if isinstance( ndiff, basestring ):
        ndiff = np.loadtxt( ndiff, dtype=np.int8 )  # a filename: load ndiff from it
    Jnear = (ndiff <= near) .nonzero()[0]
    print "subset_arrays: %d of %d are <= %d:" % (
        len(Jnear), len(ndiff), near) ,
    for k, v in sorted( adict.items() ):
        if getattr( v, "shape", None ) and len(v) == len(ndiff):
                # skip 0d arrays from np.load, e.g. np.array( 3 "str" [] ... )
            adict[k] = v[Jnear]
            print k ,
    print ""
    adict["ndiff"] = ndiff
#...............................................................................
if __name__ == "__main__":
    import sys
    from bz.etc import dataxy  # the author's data loader

    source = "statlearn/vowel*"  # Hastie, Elements of Stat Learning p. 118
    centre = 4  # 2: -= mean /= sd, 4: row norms 1
    nnear = 3
    p = 2  # p=inf is faster than p=2
        # run this.py a=1 b=None c=[3] 'd = expr' ... in sh or ipython
    exec( "\n".join( sys.argv[1:] ))

    np.set_printoptions( 1, threshold=20, edgeitems=10, linewidth=200, suppress=True )

#...............................................................................
    bag = dataxy.dataxy( source, centre=centre )
    X, y = bag.X, bag.y
    ndiff, ix = nearsame( X, y, nnear=nnear, p=p )
        # vowels, av ndiff of each class:
        # 1 .033  2 .089  3 .14  4 .14  5 .39  6 .52  7 .49  8 .14  9 .43  10 .089  11 .2
        # heed   hid     head   had    hard   hud    hod    hoard  hood   who'd    heard
    subset_arrays( ndiff, bag )  # 936 of 990 are <= 1