@helderc, forked from anonymous/kmeans_features.py, created September 24, 2018 13:07
Experiment with soft-thresholded cosine similarity to k-means centers as a feature expansion on MNIST data.
"""Experiment with soft-thresholded k-means feature for MNIST classification
This is experiment is a tentative alternative to approximate kernel expansions
explored on the same dataset by @amueller on this blog post:
http://peekaboo-vision.blogspot.fr/2012/12/kernel-approximations-for-efficient.html
Meant to be run with ``%run script.py`` in IPython.
The 1000-dim k-means based feature expansion should yield ~96% test accuracy
when trained on 20k samples in less than 20s (unsupervised feature extraction
+ classifier training).
The baseline linear model is accuracy 91% on the same dataset.
"""
# Author: [email protected]
# License: Simplified BSD
from time import time
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import RandomizedPCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import LinearSVC
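
# Portability note (not part of the original gist): the imports above target
# the scikit-learn API of the time. On a recent scikit-learn release, a rough
# sketch of the equivalent imports and data loading would be:
#
#     from sklearn.datasets import fetch_openml            # fetch_mldata was removed
#     from sklearn.model_selection import cross_val_score, train_test_split
#     from sklearn.decomposition import PCA                 # PCA(svd_solver='randomized')
#
#     mnist = fetch_openml('mnist_784', version=1)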
mnist = fetch_mldata('MNIST original')
# Load 30k samples into the dev set as we will use 3-fold CV, hence 20k samples
# for each training set.
X_dev, X_test, y_dev, y_test = train_test_split(
    mnist.data.astype(np.float32), mnist.target, train_size=30000,
    random_state=1)
scaler = MinMaxScaler()
X_scaled_dev = scaler.fit_transform(X_dev)
X_scaled_test = scaler.transform(X_test)
print("n_samples=%d, n_features=%d" % X_dev.shape)
print("n_classes=%d" % np.unique(y_dev).shape[0])
class MiniBatchKMeansMapper(MiniBatchKMeans):
    """Soft-thresholding cosine transformer on top of k-means

    This is a sort of poor man's non-linear, sparse-coded feature mapping.
    """
    def _transform(self, X):
        # Compute cosine similarities of samples w.r.t. the k-means centers
        # TODO: optimization: normalize the centers once and for all
        c = normalize(self.cluster_centers_)
        X = normalize(X)
        sims = np.dot(X, c.T)

        # Remove the negative cosine features (~50% of them)
        # TODO: make it possible to use a percentile or an absolute parameter
        # in range (-1, 1) to be cross-validated
        sims[sims < 0.0] = 0.0

        # Project the new features onto the unit euclidean ball because it
        # seems reasonable...
        # TODO: make normalization optional, to be cross-validated
        return normalize(sims, copy=True)
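
# A minimal sketch of what the mapper produces (hypothetical toy data, not part
# of the original experiment): after fitting, transform() returns non-negative
# cosine similarities to the cluster centers, re-normalized to unit length.
#
#     rng = np.random.RandomState(0)
#     toy = rng.rand(200, 50)
#     demo = MiniBatchKMeansMapper(n_clusters=10, n_init=1, init='random',
#                                  batch_size=50, init_size=30, random_state=0,
#                                  compute_labels=False).fit(toy)
#     expanded = demo.transform(toy)
#     assert expanded.shape == (200, 10)
#     assert expanded.min() >= 0.0   # negative cosines were thresholded away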
mapper = MiniBatchKMeansMapper(
    n_clusters=1000, n_init=1, init='random', batch_size=1000,
    init_size=3000, random_state=1, verbose=0,
    compute_labels=False)
models = [
    LinearSVC(C=0.01, random_state=1),
    Pipeline([
        # Reduce dimensionality to make k-means converge faster
        ('dim_reduction', RandomizedPCA(50, whiten=True, random_state=1)),
        # Non-linear feature extraction akin to an approximate kernel
        # expansion
        ('feature_map', mapper),
        # Linear classification
        ('svm', LinearSVC(C=1, random_state=1)),
    ]),
]
def bench(model, X, y, cv=3):
    print("Computing %d-fold CV for %r..." % (cv, model))
    t0 = time()
    scores = cross_val_score(model, X, y, cv=cv, verbose=1, n_jobs=1)
    total_duration = time() - t0
    print("score: %0.3f +/- %0.3f" % (np.mean(scores), np.std(scores)))

    # Compute the duration for one fold, assuming n_jobs=1
    duration = total_duration / scores.shape[0]
    print("duration: %0.3fs" % duration)
    return np.mean(scores), duration
results = [bench(m, X_scaled_dev, y_dev) for m in models]
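
# A hypothetical summary of the two runs (not in the original gist); the names
# follow the order of the models list above: the plain LinearSVC baseline
# first, the k-means feature-expansion pipeline second.
for (score, duration), name in zip(results, ("linear baseline",
                                             "k-means feature expansion")):
    print("%s: mean accuracy=%0.3f, %0.1fs per fold" % (name, score, duration))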