Created
June 15, 2015 14:43
-
-
Save StuartGordonReid/7841ab6837e7e84476f3 to your computer and use it in GitHub Desktop.
Clustering Objective Functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ClusteringQuality: | |
""" | |
Instances of this class implement the two measures of clustering quality discussed in the article, namely the davies | |
bouldin index and the silhouette index. It also implements a number of useful helper methods. | |
:param solution: the clustering solution of type Clustering | |
:param minimum: the minimum distance allowable | |
""" | |
def __init__(self, solution, minimum): | |
""" | |
Initializes a ClusteringQuality object with a given Clustering solution and a minimum distance | |
:param solution: this is an object of type Clustering | |
:param minimum: this is the minimum distance allowed between two points | |
""" | |
assert isinstance(solution, Clustering) | |
self.solution = solution | |
self.e = minimum | |
def cluster_totals(self): | |
""" | |
This method calculates the total distance from every centroid to every pattern assigned to it. It also records | |
the number of patterns in each cluster which are used to compute average distances in cluster_averages() | |
:return: a two dimensional list of [total cluster distance, total patterns in cluster] for each centroid | |
""" | |
s = Similarity(self.e) | |
# create array (will be 2d) to store total internal cluster distances and cluster counts for each centroid | |
cluster_distances_counts = [] | |
for i in range(self.solution.num_clusters): | |
ith_cluster_count = 0.0 | |
ith_cluster_distance = 0.0 | |
for z in range(len(self.solution.solution)): | |
# update the count and the total distance for the centroid z[i] belongs to (whichever one that is) | |
if self.solution.solution[z] == i: | |
ith_cluster_count += 1 | |
ith_cluster_distance += s.fractional_distance(self.solution.patterns[z], self.solution.centroids[i]) | |
# add the result to the 2d list | |
cluster_distances_counts.append([ith_cluster_distance, max(ith_cluster_count, 1.0)]) | |
return np.array(cluster_distances_counts) | |
def cluster_averages(self): | |
""" | |
Receives output from cluster_totals() and computes the average distance per centroid | |
:return: average distance from each centroid to the patterns assigned to it | |
""" | |
# create list to store averages in | |
cluster_averages = [] | |
# get the total internal cluster distances plus the counts for each centroid / cluster | |
cluster_distances_counts = self.cluster_totals() | |
for i in range(len(cluster_distances_counts)): | |
# calculate the averages and add it to the list | |
cluster_averages.append(cluster_distances_counts[i][0] / cluster_distances_counts[i][1]) | |
return np.array(cluster_averages) | |
def davies_bouldin(self): | |
""" | |
This method computes the davies-bouldin (db) of a given clustering. | |
:return: the davies bouldin value of the clustering | |
""" | |
# get the average internal cluster distances | |
cluster_averages = self.cluster_averages() | |
# create variable for db | |
davies_bouldin = 0.0 | |
s = Similarity(self.e) | |
# for each cluster / centroid i | |
for i in range(self.solution.num_clusters): | |
# for each cluster / centroid j | |
for j in range(self.solution.num_clusters): | |
# when i and j are not the same cluster / centroid | |
if j != i: | |
# calculate the distance between the two centroids of i and j | |
d_ij = s.fractional_distance(self.solution.centroids[i], self.solution.centroids[j]) | |
# update the variable to equal to sum of internal cluster distances of clusters i and j divided by | |
# the previously computer value i.e. the distance between centroid i and centroid j | |
d_ij = (cluster_averages[i] + cluster_averages[j]) / d_ij | |
# update db is this is larger than any db seen before | |
davies_bouldin = max(d_ij, davies_bouldin) | |
return davies_bouldin | |
def silhouette_index(self, index): | |
""" | |
This method computes the silhouette index (si) for any given pattern between -1 and 1 | |
:param index: the pattern we are looking at now | |
:return: the silhouette index for that pattern | |
""" | |
# store the total distance to each cluster | |
silhouette_totals = [] | |
# store the number of patterns in each cluster | |
silhouette_counts = [] | |
# initialize the variables | |
for i in range(self.solution.num_clusters): | |
silhouette_totals.append(0.0) | |
silhouette_counts.append(0.0) | |
s = Similarity(self.e) | |
for i in range(len(self.solution.patterns)): | |
# for every pattern other than the one we are calculating now | |
if i != index: | |
# get the distance between pattern[index] and that pattern | |
distance = s.fractional_distance(self.solution.patterns[i], self.solution.patterns[index]) | |
# add that distance to the silhouette totals for the correct cluster | |
silhouette_totals[self.solution.solution[i]] += distance | |
# update the number of patterns in that cluster | |
silhouette_counts[self.solution.solution[i]] += 1 | |
# setup variable to find the cluster (not equal to the pattern[index]'s cluster) with the smallest distance | |
smallest_silhouette = silhouette_totals[0] / max(1.0, silhouette_counts[0]) | |
for i in range(len(silhouette_totals)): | |
# calculate the average distance of each pattern in that cluster from pattern[index] | |
silhouette = silhouette_totals[i] / max(1.0, silhouette_counts[i]) | |
# if the average distance is lower and it isn't pattern[index] cluster update the value | |
if silhouette < smallest_silhouette and i != self.solution.solution[index]: | |
smallest_silhouette = silhouette | |
# calculate the internal cluster distances for pattern[index] | |
index_cluster = self.solution.solution[index] | |
index_silhouette = self.e + silhouette_totals[index_cluster] / max(1.0, silhouette_counts[index_cluster]) | |
# return the ratio between the smallest distance from pattern[index] to another cluster's patterns and | |
# the patterns belong to the same cluster as pattern[index] | |
return (smallest_silhouette - index_silhouette) / max(smallest_silhouette, index_silhouette) | |
def silhouette_index_zero_one(self, index): | |
""" | |
Returns the silhouette index between 0 and 1 and makes it a minimization objective (easier) | |
:param index: the pattern we are looking at now | |
:return: the silhouette index for that pattern | |
""" | |
return 1 - ((1 + self.silhouette_index(index)) / 2.0) | |
def average_silhouette_index(self, scaled_zero_one=True): | |
""" | |
This method computes the average silhouette index value every pattern in the data set. | |
:param scaled_zero_one: allows you to scale the result between 0 and 1 and reverse the order | |
:return: the silhouette index of the given clustering | |
""" | |
silhouette_sum = 0.0 | |
for i in range(len(self.solution.patterns)): | |
if scaled_zero_one: | |
silhouette_sum += self.silhouette_index_zero_one(i) | |
else: | |
silhouette_sum += self.silhouette_index(i) | |
return silhouette_sum / len(self.solution.patterns) | |
def quantization_error(self): | |
""" | |
This method calculates the quantization error of the given clustering | |
:return: the quantization error | |
""" | |
total_distance = 0.0 | |
s = Similarity(self.e) | |
for i in range(len(self.solution.patterns)): | |
total_distance += math.pow(s.fractional_distance(self.solution.patterns[i], | |
self.solution.centroids[self.solution.solution[i]]), 2.0) | |
return total_distance / len(self.solution.patterns) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thx for the code 👍 , but i think it's will be awesome if you can give example :)