Skip to content

Instantly share code, notes, and snippets.

@stes
Created December 30, 2016 13:30
Show Gist options
  • Save stes/92db6023aa3dab5d13e49ece198102c7 to your computer and use it in GitHub Desktop.
Save stes/92db6023aa3dab5d13e49ece198102c7 to your computer and use it in GitHub Desktop.
sklearn Clustering Pipeline using PCA, TSNE Embedding and KMeans Clustering
from collections import OrderedDict

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def cluster(X, pca_components=100, min_explained_variance=0.5, tsne_dimensions=2,
            nb_centroids=(4, 8, 16), X_=None, embedding=None):
    """Simple K-Means clustering pipeline for high dimensional data.

    Perform the following steps for robust clustering:

    - Zero mean, unit variance normalization over all feature dimensions
    - PCA transform to ``pca_components``
    - T-SNE transform of PCA output to ``tsne_dimensions``
    - K-Means clustering to ``nb_centroids``

    Parameters
    ----------
    X : ndarray
        Raw dataset. Only read when ``X_`` is None. Normalization statistics
        are computed along axis 0.
    pca_components : int
        Number of dimensions for principal component analysis.
    min_explained_variance : float, between 0...1
        Ratio of variance that has to be explained by the PCA. Otherwise,
        abort with an error.
    tsne_dimensions : int
        Dimensions for the T-SNE algorithm.
    nb_centroids : sequence of int
        Numbers of centroids for the KMeans algorithm; one clustering is
        computed per entry.
    X_ : ndarray [optional]
        PCA-transformed dataset, skip PCA if this is not None.
    embedding : ndarray [optional]
        TSNE-transformed embedding, skip TSNE if this is not None.

    Returns
    -------
    X_ : ndarray
        PCA-transformed dataset (the argument itself when it was given).
    embedding : ndarray
        TSNE embedding (the argument itself when it was given).
    labels : ndarray
        Cluster assignments stacked along axis 1, i.e. one column per entry
        of ``nb_centroids``, in the same order.

    Raises
    ------
    ValueError
        If ``nb_centroids`` is empty.
    AssertionError
        If the PCA does not explain more than ``min_explained_variance``
        of the variance.
    """
    nb_centroids = list(nb_centroids)
    if not nb_centroids:
        # Fail before the expensive PCA / T-SNE steps; np.stack would raise
        # the same exception type on an empty list anyway.
        raise ValueError("nb_centroids must contain at least one cluster count")

    if X_ is None:
        X = np.asarray(X)
        std = X.std(axis=0, keepdims=True)
        # A constant feature has zero standard deviation; dividing by it
        # would feed NaN/inf into the PCA. Leave such features centered only.
        std = np.where(std == 0, 1.0, std)
        pca = PCA(n_components=pca_components)
        X_ = pca.fit_transform((X - X.mean(axis=0, keepdims=True)) / std)
        explained = pca.explained_variance_ratio_.sum()
        print("PCA can explain {:.2f}% of the variance".format(100 * explained))
        # Raised explicitly (instead of a bare ``assert``) so the check is
        # not stripped under ``python -O``; the exception type is unchanged.
        if not explained > min_explained_variance:
            raise AssertionError(
                "PCA explains only {:.2f}% of the variance, "
                "more than {:.2f}% is required".format(
                    100 * explained, 100 * min_explained_variance))

    if embedding is None:
        tsne = TSNE(n_components=tsne_dimensions)
        embedding = tsne.fit_transform(X_)

    # One independent K-Means run per requested number of centroids.
    y = [KMeans(n_clusters=nb).fit_predict(embedding) for nb in nb_centroids]
    return X_, embedding, np.stack(y, axis=1)
def clusterplot(x, y, classes):
    """Draw a scatter plot of ``x`` against ``y`` colored by ``classes``.

    NOTE(review): ``figure``, ``scatter`` and ``axis`` are not imported in
    this file; presumably they come from a pylab / ``matplotlib.pyplot``
    star-import in the calling environment -- confirm.
    """
    canvas_size = (10, 10)
    figure(figsize=canvas_size)
    scatter(x, y, c=classes)
    # Hide the axes of the plot.
    axis("off")
def save(fname, X_, embedding, y, nb_centroids=(4, 8, 16)):
    """Write the clustering results to a CSV file.

    The columns are written in this order: ``tsne_x``, ``tsne_y``, one
    ``pca_<i>`` column per column of ``X_``, then one ``cluster_<nb>``
    column per entry of ``nb_centroids``.

    Parameters
    ----------
    fname : str or file-like
        Destination handed to ``pandas.DataFrame.to_csv``.
    X_ : ndarray
        2-D PCA-transformed dataset; every column is stored.
    embedding : ndarray
        2-D embedding; only its first two columns are stored.
    y : ndarray
        2-D cluster assignments; column ``j`` is stored under the name
        ``"cluster_" + str(nb_centroids[j])``.
    nb_centroids : sequence of int [optional]
        Cluster counts used to name the columns of ``y``. Defaults to
        ``(4, 8, 16)``.
    """
    results = OrderedDict()
    results["tsne_x"] = embedding[:, 0]
    results["tsne_y"] = embedding[:, 1]
    for i in range(X_.shape[1]):
        results["pca_" + str(i)] = X_[:, i]
    for column, nb in enumerate(nb_centroids):
        results["cluster_" + str(nb)] = y[:, column]
    pd.DataFrame(results).to_csv(fname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment