Created
September 25, 2011 18:33
-
-
Save GaelVaroquaux/1240940 to your computer and use it in GitHub Desktop.
With a random, shapeless affinity matrix, spectral clustering does not work: the spectrum of the Laplacian is flat.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# gvaroquaux (adapted from tdh.net gmail com) | |
# 31.August.2011 | |
# try clustering module in scikits.learn | |
import numpy as np | |
from scipy import linalg | |
from sklearn.metrics.pairwise import euclidean_distances | |
from sklearn.cluster import SpectralClustering | |
############################################################################ | |
def print_clusters(labels, k):
    '''Print the sample indices assigned to each of the first ``k`` clusters.

    Parameters
    ----------
    labels : array-like of int
        Cluster label of every sample. Converted to a NumPy array so the
        element-wise comparison ``labels == i`` also works for plain lists.
    k : int
        Number of clusters; cluster ids ``0 .. k-1`` are reported, one line
        per cluster.
    '''
    labels = np.asarray(labels)
    for i in range(k):
        members = np.nonzero(labels == i)[0]
        # A single pre-formatted argument keeps the output identical whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print('cluster %d:  %s' % (i, members))
############################################################################ | |
def sample_data(num, k=2, dstyle='uniform', sigma=5):
    '''Generate random 2D points and return their pairwise affinity matrix.

    Parameters
    ----------
    num : int
        Number of 2D data points to draw.
    k : int
        Number of Gaussian blobs; only used when ``dstyle == 'gaussian'``.
    dstyle : {'uniform', 'gaussian'}
        Data distribution style: 'uniform' draws the points uniformly over
        a 20 x 20 square, 'gaussian' draws them around ``k`` random centers.
    sigma : float
        Standard deviation passed to ``np.random.normal`` for each
        coordinate of a Gaussian blob.

    Returns
    -------
    afmat : ndarray
        ``exp(-d**2 / std(d)**2) + 1e-6``, where ``d`` holds the Euclidean
        distance between every pair of points.

    Raises
    ------
    ValueError
        If ``dstyle`` is not one of the supported styles.
    '''
    if dstyle not in ('uniform', 'gaussian'):
        # An unknown style used to leave every point at the origin, which
        # makes the distance std zero and the affinity matrix all NaN.
        raise ValueError(
            "dstyle must be 'uniform' or 'gaussian', got %r" % (dstyle,))

    xmax = 20
    ymax = 20

    # randomly generate data
    data = np.zeros((num, 2))
    if dstyle == 'uniform':
        data[:, 0] = np.random.rand(num) * xmax
        data[:, 1] = np.random.rand(num) * ymax
    else:
        # the centers of the Gaussians (both coordinates are scaled by xmax)
        seeds = np.random.rand(k, 2) * xmax
        # Floor division keeps the slice bounds integral on Python 3 too.
        step = num // k + 1
        ss = 0
        for i in range(k):
            # generate cluster "i": rows ss (inclusive) to end (exclusive)
            end = min(ss + step, num)
            # generate x, y separately
            data[ss:end, 0] = np.random.normal(seeds[i, 0], sigma, end - ss)
            data[ss:end, 1] = np.random.normal(seeds[i, 1], sigma, end - ss)
            ss = end

    # distance between every pair of points
    afmat = euclidean_distances(data)
    # turn distances into affinities; the 1e-6 offset keeps every entry
    # strictly positive
    afmat = np.exp(-afmat ** 2 / (afmat.std() ** 2)) + 1e-6
    return afmat
############################################################################## | |
if __name__ == '__main__':
    # Demo: cluster a random affinity matrix and show the Laplacian spectrum.
    num = 400
    k = 2
    dstyle = 'uniform'  # or 'gaussian'
    sigma = 5.0
    affinity = sample_data(num, k, dstyle, sigma)

    # NOTE(review): ``k=``/``mode=`` and ``sklearn.utils.graph.graph_laplacian``
    # follow the 2011-era scikit-learn API; later releases appear to have
    # renamed or moved them -- confirm against the installed version.
    sp_clt = SpectralClustering(k=k, mode='amg')
    sp_clt.fit(affinity)

    # now, visualize
    print_clusters(sp_clt.labels_, k)

    from sklearn.utils.graph import graph_laplacian
    laplacian = graph_laplacian(affinity, normed=True)
    # Single pre-formatted argument: prints the same text under the Python 2
    # ``print`` statement and the Python 3 ``print`` function.
    print('Laplacian spectrum: %s' % (linalg.eigvalsh(laplacian),))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment