Created
December 18, 2015 05:50
-
-
Save vvksh/b3051d19f10df08adef4 to your computer and use it in GitHub Desktop.
Implementation of K-Nearest neighbor classification algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class KNN(Classifier): | |
def __init__(self, dataObj=None, headers=[], categories=None, K=None): | |
'''Take in a Data object with N points, a set of F headers, and a | |
matrix of categories, with one category label for each data point.''' | |
# call the parent init with the type | |
Classifier.__init__(self, 'KNN Classifier') | |
# store the headers used for classification | |
self.headers = headers | |
# number of classes and number of features | |
self.num_classes = 0 | |
self.num_features = 0 | |
# original class labels | |
self.class_labels = [] | |
# unique data for the KNN classifier: list of exemplars (matrices) | |
self.exemplars = [] | |
if dataObj != None: | |
A = dataObj.get_data(headers) | |
self.build(A, categories, K) | |
def build(self, A, categories, K=None): | |
'''Builds the classifier give the data points in A and the categories''' | |
# figure out how many categories there are and get the mapping (np.unique) | |
unique, mapping = np.unique(np.array(categories.T), return_inverse=True) | |
# print "unique","mapping", unique, mapping | |
self.num_classes = len(unique) | |
self.num_features = A.shape[1] | |
self.class_labels = unique | |
for i in range(self.num_classes): | |
exemplar = A[(mapping == i), :] | |
if K == None: | |
# append to exemplars a matrix with all of the rows of A where the category/mapping is i | |
# print exemplar.shape, "exemplar.shape", i, "ith exemplar" | |
self.exemplars.append(exemplar) | |
elif K != None and isinstance(K, int): # check if K is an integer | |
# W = vq.whiten(exemplar) | |
codebook = an.kmeans_init(exemplar, K) | |
(codebook, codes, errors) = an.kmeans_algorithm(exemplar, codebook) # run K-means on the rows of A where the category/mapping is i | |
# print codebook.shape, "codebook", "ith codebook", i, K, "cluster" | |
self.exemplars.append(codebook) # append the codebook to the exemplars | |
self.write("KNN_classifier.csv") | |
return | |
def classify(self, A, K=5, return_distances=False): | |
'''Classify each row of A into one category. Return a matrix of | |
category IDs in the range [0..C-1], and an array of class | |
labels using the original label values. If return_distances is | |
True, it also returns the NxC distance matrix. | |
The parameter K specifies how many neighbors to use in the | |
distance computation. The default is three.''' | |
# print A, "A", self.exemplars, "exemplars" | |
# error check to see if A has the same number of columns as the class means | |
if A.shape[1] != self.num_features: | |
print A.shape[1], self.num_features | |
print "check data dimensions" | |
return | |
# make a matrix that is N x C to store the distance to each class for each data point | |
D = np.matlib.zeros((A.shape[0], self.num_classes)) # a matrix of zeros that is N (rows of A) x C (number of classes) | |
for k in range(self.num_classes): | |
# make a temporary matrix that is N x M where M is the number of exemplars (rows in exemplars[i]) | |
temp = np.matrix(scipy.spatial.distance.cdist(A,self.exemplars[k],'euclidean')) | |
temp = np.sort(temp,axis =1) | |
# print temp, "temp sorted" | |
D[:,k] = np.sum(temp[:,:K], axis =1) # sum the first K columns # this is the distance to the first class | |
# calculate the most likely class for each data point | |
cats = np.argmin(D,axis=1) # take the argmin of D along axis 1 | |
# use the class ID as a lookup to generate the original labels | |
labels = self.class_labels[cats] | |
if return_distances: | |
return cats, labels, D | |
return cats, labels | |
def __str__(self): | |
'''Make a pretty string that prints out the classifier information.''' | |
s = "\nKNN Classifier\n" | |
for i in range(self.num_classes): | |
s += 'Class %d --------------------\n' % (i) | |
s += 'Number of Exemplars: %d\n' % (self.exemplars[i].shape[0]) | |
s += 'Mean of Exemplars :' + str(np.mean(self.exemplars[i], axis=0)) + "\n" | |
s += "\n" | |
return s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is an implementation of the KNN algorithm for data classification. Suppose you have two types of data (say, cats and dogs) and you have plotted them based on skin color and body weight. Given a new data point with a certain skin color and body weight, you are asked to classify it as a dog or a cat. Using this algorithm, you plot the new test point along with the known values, find, say, the 5 nearest values, and see whether there are more cats or more dogs among those 5 closest values. The above implementation also works for large data sets whose points have multiple features.
I wrote the implementation myself.