Skip to content

Instantly share code, notes, and snippets.

@mbrengel
Last active September 27, 2019 12:14
Show Gist options
  • Save mbrengel/af22619faee414fbc7d7803dfb095c98 to your computer and use it in GitHub Desktop.
Save mbrengel/af22619faee414fbc7d7803dfb095c98 to your computer and use it in GitHub Desktop.
Hierarchical Agglomerative Clustering
#!/usr/bin/env python3.7
from scipy.cluster import hierarchy
from matplotlib import pyplot as plt
import numpy as np
class ClusteringItem:
    """A labelled observation that can be fed to ``Clustering``.

    Attributes:
        label: Value identifying the item; ``str(item)`` is ``str(label)``.
        features: Payload handed to the distance function by
            ``Clustering.linkage``. Its type is not constrained here; it only
            has to be something the chosen distance function accepts.
    """

    def __init__(self, label, features):
        self.label = label
        self.features = features

    def __str__(self):
        return str(self.label)

    def __repr__(self):
        # Unambiguous form for debugging; __str__ stays the short label that
        # is used when clusters are printed or drawn.
        return (
            f"{type(self).__name__}"
            f"(label={self.label!r}, features={self.features!r})"
        )
class Clustering:
    """Hierarchical agglomerative clustering over a sequence of items.

    Call ``linkage()`` to build the merge tree, then ``cluster()`` to cut it
    at a distance threshold.

    Attributes:
        items: The sequence given to the constructor. Every element must
            expose a ``features`` attribute.
        clusters: List of clusters (each a list of items), largest first.
            Empty until ``cluster()`` has run.
        Z: Linkage matrix produced by ``linkage()``; not set before that call.
    """

    def __init__(self, items):
        self.items = items
        # A list from the start, so the attribute has the same type before
        # and after cluster() runs.
        self.clusters = []

    def linkage(self, dist, method="complete"):
        """Compute all pairwise distances and store the linkage matrix in ``self.Z``.

        Args:
            dist: Callable taking the ``features`` of two items and returning
                their distance.
            method: Linkage method forwarded to
                ``scipy.cluster.hierarchy.linkage``.
        """
        count = len(self.items)
        if count < 2:
            # With fewer than two items there is no pair to measure, and
            # SciPy rejects an empty distance matrix. Store a linkage matrix
            # with zero merge rows instead.
            self.Z = np.empty((0, 4))
            return

        # Condensed distance matrix: the upper triangle (i < j), row by row.
        condensed = np.zeros(count * (count - 1) // 2)
        pos = 0
        for i in range(count):
            for j in range(i + 1, count):
                condensed[pos] = dist(self.items[i].features, self.items[j].features)
                pos += 1
        self.Z = hierarchy.linkage(condensed, method=method)

    def cluster(self, threshold, dendrogram=False):
        """Cut the tree at ``threshold`` and store the result in ``self.clusters``.

        Args:
            threshold: Cut-off passed to ``scipy.cluster.hierarchy.fcluster``
                with ``criterion="distance"``.
            dendrogram: When true, also draw the dendrogram, labelled with
                ``str(item)`` for every item, and call ``plt.show()``.
                Nothing is drawn when there are fewer than two items.

        Raises:
            AttributeError: If ``linkage()`` has not been called yet.
        """
        if self.Z.shape[0] == 0:
            # No merges were recorded: every item is its own cluster.
            self.clusters = [[item] for item in self.items]
            return

        labels = hierarchy.fcluster(self.Z, threshold, criterion="distance")
        # Cluster ids are 1-based, hence the -1 when indexing the buckets.
        buckets = [[] for _ in range(int(labels.max()))]
        for item, cluster_id in zip(self.items, labels):
            buckets[cluster_id - 1].append(item)
        self.clusters = sorted(buckets, key=len, reverse=True)

        if dendrogram:
            hierarchy.dendrogram(
                self.Z,
                # Let SciPy label the leaves directly instead of rewriting
                # the tick texts afterwards.
                labels=[str(item) for item in self.items],
                # NOTE(review): the eps presumably nudges the colour cut-off
                # just above ``threshold`` so links merged at exactly the
                # threshold are coloured with their cluster — confirm.
                color_threshold=threshold + np.finfo(float).eps,
                leaf_rotation=90.,
            )
            plt.show()
#!/usr/bin/env python3.7
from clustering import ClusteringItem, Clustering
def jaccard(a, b):
    """Return the Jaccard distance between two sets.

    The distance is ``1 - |a & b| / |a | b|``: 0.0 for equal sets and 1.0 for
    disjoint ones. Two empty sets are treated as equal (distance 0.0), which
    also avoids dividing by zero.

    Args:
        a: Set-like object supporting ``|``, ``&`` and ``len()``.
        b: Set-like object supporting ``|``, ``&`` and ``len()``.

    Returns:
        The distance as a float between 0.0 and 1.0.
    """
    union = a | b
    if not union:
        return 0.0
    return 1 - len(a & b) / len(union)
# Demo data: two pairs of overlapping feature sets ("a"/"b" and "h"/"i")
# plus one set ("z") that shares nothing with the others.
items = [
    ClusteringItem(label, features)
    for label, features in (
        ("a", {1, 2, 3}),
        ("b", {1, 2}),
        ("h", {7, 8, 9}),
        ("i", {7, 8}),
        ("z", {11, 12, 13}),
    )
]

# Build the tree from Jaccard distances using the default linkage method.
C = Clustering(items)
C.linkage(jaccard)

# Each pair is 1/3 apart, so a cut at 0.4 keeps the pairs together; the
# dendrogram is drawn as well.
C.cluster(0.4, dendrogram=True)

# One output line per cluster, members separated by commas.
for cluster in C.clusters:
    print(", ".join(str(member) for member in cluster))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment