Created
June 23, 2017 16:05
-
-
Save gpedote/68a6678f50fd9f07ac3889ff0bd57102 to your computer and use it in GitHub Desktop.
Dunn index for sklearn-generated clusters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dunn(c, distances): | |
""" | |
Dunn index for cluster validation (the bigger, the better) | |
.. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace | |
where :math:`d(c_i,c_j)` represents the distance between | |
clusters :math:`c_i` and :math:`c_j`, given by the distances between its | |
two closest data points, and :math:`diam(c_k)` is the diameter of cluster | |
:math:`c_k`, given by the distance between its two farthest data points. | |
The bigger the value of the resulting Dunn index, the better the clustering | |
result is considered, since higher values indicate that clusters are | |
compact (small :math:`diam(c_k)`) and far apart. | |
.. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster validity measurement techniques. 6th International Symposium of Hungarian Researchers on Computational Intelligence. | |
""" | |
unique_cluster_distances = np.unique(min_cluster_distances(c, distances)) | |
max_diameter = max(diameter(c, distances)) | |
if np.size(unique_cluster_distances) > 1: | |
return unique_cluster_distances[1] / max_diameter | |
else: | |
return unique_cluster_distances[0] / max_diameter | |
def min_cluster_distances(c, distances): | |
"""Calculates the distances between the two nearest points of each cluster""" | |
min_distances = np.zeros((max(c) + 1, max(c) + 1)) | |
for i in np.arange(0, len(c)): | |
if c[i] == -1: continue | |
for ii in np.arange(i + 1, len(c)): | |
if c[ii] == -1: continue | |
if c[i] != c[ii] and distances[i, ii] > min_distances[c[i], c[ii]]: | |
min_distances[c[i], c[ii]] = min_distances[c[ii], c[i]] = distances[i, ii] | |
return min_distances | |
def diameter(c, distances): | |
"""Calculates cluster diameters (the distance between the two farthest data points in a cluster)""" | |
diameters = np.zeros(max(c) + 1) | |
for i in np.arange(0, len(c)): | |
if c[i] == -1: continue | |
for ii in np.arange(i + 1, len(c)): | |
if c[ii] == -1: continue | |
if c[i] != -1 or c[ii] != -1 and c[i] == c[ii] and distances[i, ii] > diameters[c[i]]: | |
diameters[c[i]] = distances[i, ii] | |
return diameters |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment