Created
December 30, 2016 13:30
-
-
Save stes/92db6023aa3dab5d13e49ece198102c7 to your computer and use it in GitHub Desktop.
sklearn Clustering Pipeline using PCA, TSNE Embedding and KMeans Clustering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def cluster(X, pca_components=100, min_explained_variance=0.5, tsne_dimensions=2,
            nb_centroids=(4, 8, 16), X_=None, embedding=None):
    """Simple K-Means clustering pipeline for high dimensional data.

    Performs the following steps for robust clustering:

    - Zero mean, unit variance normalization over all feature dimensions
    - PCA transform to ``pca_components`` dimensions
    - T-SNE transform of the PCA output to ``tsne_dimensions``
    - K-Means clustering, once per entry of ``nb_centroids``

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input dataset. Only used when ``X_`` is None.
    pca_components : int
        Number of dimensions for principal component analysis.
    min_explained_variance : float, between 0 and 1
        Ratio of variance that has to be explained by the PCA;
        otherwise a ``ValueError`` is raised.
    tsne_dimensions : int
        Number of output dimensions for the T-SNE algorithm.
    nb_centroids : iterable of int
        Numbers of centroids for the KMeans algorithm; one clustering
        is computed per entry.
    X_ : ndarray, optional
        Pre-computed PCA-transformed dataset; normalization and PCA are
        skipped if this is not None.
    embedding : ndarray, optional
        Pre-computed T-SNE embedding; T-SNE is skipped if this is not None.

    Returns
    -------
    X_ : ndarray
        PCA-transformed data.
    embedding : ndarray
        T-SNE embedding of ``X_``.
    y : ndarray, shape (n_samples, len(nb_centroids))
        Cluster assignments, one column per entry of ``nb_centroids``.

    Raises
    ------
    ValueError
        If the PCA explains less than ``min_explained_variance`` of the
        total variance.
    """
    if X_ is None:
        pca = PCA(n_components=pca_components)
        # Normalize every feature to zero mean / unit variance before PCA.
        X_normed = (X - X.mean(axis=0, keepdims=True)) / X.std(axis=0, keepdims=True)
        X_ = pca.fit_transform(X_normed)
        explained = pca.explained_variance_ratio_.sum()
        print("PCA can explain {:.2f}% of the variance".format(100 * explained))
        # Raise instead of assert: asserts are stripped under ``python -O``.
        if explained <= min_explained_variance:
            raise ValueError(
                "PCA explains only {:.2%} of the variance; "
                "required more than {:.2%}".format(explained, min_explained_variance))
    if embedding is None:
        tsne = TSNE(n_components=tsne_dimensions)
        embedding = tsne.fit_transform(X_)
    y = []
    for nb in nb_centroids:
        # fit_predict is equivalent to fit(...) followed by predict(...)
        # on the same data for KMeans.
        km = KMeans(n_clusters=nb)
        y.append(km.fit_predict(embedding))
    return X_, embedding, np.stack(y, axis=1)
def clusterplot(x, y, classes):
    """Scatter-plot a 2-D embedding, coloring each point by its cluster.

    NOTE(review): relies on ``figure``, ``scatter`` and ``axis`` being in
    scope (e.g. via ``%pylab`` in a notebook or
    ``from matplotlib.pylab import *``) -- they are not imported in this
    file, so calling this as-is raises ``NameError``. Confirm the intended
    execution environment.

    Parameters
    ----------
    x, y : array-like
        Point coordinates (e.g. the two T-SNE embedding columns).
    classes : array-like
        Per-point cluster labels, used as scatter colors.
    """
    figure(figsize=(10,10))
    scatter(x,y, c = classes)
    axis("off")
def save(fname, X_, embedding, y, nb_centroids=(4, 8, 16)):
    """Save clustering results to a CSV file.

    Writes one row per sample containing the T-SNE coordinates, every PCA
    feature, and one cluster-assignment column per centroid count.

    Parameters
    ----------
    fname : str
        Output CSV file name.
    X_ : ndarray, shape (n_samples, n_pca_components)
        PCA-transformed features (saved as columns ``pca_0``, ``pca_1``, ...).
    embedding : ndarray, shape (n_samples, 2)
        T-SNE embedding (saved as columns ``tsne_x`` and ``tsne_y``).
    y : ndarray, shape (n_samples, len(nb_centroids))
        Cluster assignments, one column per centroid count, as returned
        by ``cluster``.
    nb_centroids : iterable of int
        Centroid counts used for clustering; determines the
        ``cluster_<nb>`` column names. The default matches the default of
        ``cluster`` and reproduces the original ``cluster_4/8/16`` columns.
    """
    results = OrderedDict()
    results["tsne_x"] = embedding[:, 0]
    results["tsne_y"] = embedding[:, 1]
    # Bug fix: the original referenced the undefined names ``pca_f`` and
    # ``tsne_results``; they should have been ``X_`` and ``results``.
    for i in range(X_.shape[1]):
        results["pca_" + str(i)] = X_[:, i]
    for col, nb in enumerate(nb_centroids):
        results["cluster_" + str(nb)] = y[:, col]
    pd.DataFrame(results).to_csv(fname)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment