Different clustering algorithms
| """ | |
| ========================================================= | |
| Comparing different clustering algorithms on toy datasets | |
| ========================================================= | |
| This example aims at showing characteristics of different | |
| clustering algorithms on datasets that are "interesting" | |
| but still in 2D. The last dataset is an example of a 'null' | |
| situation for clustering: the data is homogeneous, and | |
| there is no good clustering. | |
| While these examples give some intuition about the algorithms, | |
| this intuition might not apply to very high dimensional data. | |
| The results could be improved by tweaking the parameters for | |
| each clustering strategy, for instance setting the number of | |
| clusters for the methods that needs this parameter | |
| specified. Note that affinity propagation has a tendency to | |
| create many clusters. Thus in this example its two parameters | |
| (damping and per-point preference) were set to to mitigate this | |
| behavior. | |
| """ | |
print(__doc__)

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

np.random.seed(0)
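# (seeding NumPy's global RNG makes the random draws below, including the
# generators' noise, reproducible across runs)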

# Generate datasets. We choose a size big enough to show how the algorithms
# scale, but small enough to keep the running times short.
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
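# noisy_circles and noisy_moons are non-convex shapes, blobs are isotropic
# Gaussians, and no_structure is uniform noise, the 'null' case from the
# docstring (it has no ground-truth labels, hence the None)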

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

plt.figure(figsize=(17, 9.5))
plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1

for i_dataset, dataset in enumerate([noisy_circles, noisy_moons, blobs,
                                     no_structure]):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
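    # (kneighbors_graph is directed: row i marks the neighbors of point i,
    # so averaging with the transpose yields a symmetric matrix)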

    # Compute distances (computed here but not used further below)
    # distances = np.exp(-euclidean_distances(X))
    distances = euclidean_distances(X)

    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    # note: AgglomerativeClustering's 'affinity' parameter was renamed to
    # 'metric' in newer scikit-learn releases
    average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                      affinity="cityblock",
                                                      n_clusters=2,
                                                      connectivity=connectivity)

    for name, algorithm in [
            ('MiniBatchKMeans', two_means),
            ('AffinityPropagation', affinity_propagation),
            ('MeanShift', ms),
            ('SpectralClustering', spectral),
            ('Ward', ward),
            ('AgglomerativeClustering', average_linkage),
            ('DBSCAN', dbscan)]:
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            # np.int was removed from recent NumPy releases; the builtin
            # int behaves the same here
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)
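        # (every estimator above exposes labels_ after fit, so the predict
        # fallback is defensive rather than strictly needed)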

        # plot
        plt.subplot(4, 7, plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()
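The script reports only wall-clock fit time per algorithm. If you also want a rough quality number for each result, silhouette scoring is a natural follow-up. The snippet below is a minimal sketch, not part of the original gist: it assumes X and the fitted estimators from the loop above are still in scope (so it scores the last dataset processed) and uses silhouette_score from sklearn.metrics.

from sklearn.metrics import silhouette_score

# minimal sketch: score two of the fitted estimators on the last dataset
for name, algorithm in [('MiniBatchKMeans', two_means), ('DBSCAN', dbscan)]:
    labels = algorithm.labels_
    # silhouette_score needs at least two distinct labels; DBSCAN can
    # collapse to a single (noise) label for an unlucky eps
    if len(set(labels)) > 1:
        print('%s: silhouette = %.3f' % (name, silhouette_score(X, labels)))

Higher values indicate denser, better-separated clusters. Note that silhouette_score treats the DBSCAN noise label (-1) as just another cluster, so scores for density-based results should be read with that caveat.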