@kastnerkyle · Last active December 31, 2015
Updated KMeansCoder from @vene for sklearn 0.14
# Authors: Olivier Grisel <[email protected]>
# James Bergstra <[email protected]>
# Vlad Niculae <[email protected]>
#
# License: BSD 3 Clause
# Updated to sklearn 0.14 by Kyle Kastner <[email protected]>
import numpy as np
from sklearn.decomposition import PCA, sparse_encode
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances


class KMeansCoder(BaseEstimator, TransformerMixin):
    """K-means based dictionary learning.

    The fit method receives an array of signals, whitens them using
    a PCA transform and runs a k-means algorithm to extract the patch
    cluster centers.

    The input is then correlated with each individual "patch-center"
    treated as a convolution kernel. The transformation can be done
    using various sparse coding methods, thresholding or the triangle
    k-means non-linearity.

    This estimator only implements the unsupervised feature extraction
    part of the referenced paper. Image classification can then be
    performed by training a linear SVM model on the output of this
    estimator.

    Parameters
    ----------
    n_atoms: int,
        number of centers extracted by the k-means algorithm

    whiten: boolean, optional: default True
        perform a whitening PCA on the data at feature extraction time

    n_components: int, optional: default None
        number of components to keep after whitening individual samples

    max_iter: int, default 100
        maximum number of iterations to run the k-means algorithm

    n_init: int, default 1
        number of times to initialize the k-means algorithm in order to
        avoid convergence to local optima

    n_prefit: int, default 5
        dimension of the reduced curriculum space in which to prefit the
        k-means algorithm for increased performance.
        This is used only when `whiten=True`.

    tol: float, default 1e-4
        tolerance for numerical errors

    local_contrast: boolean, optional: default True
        perform local contrast normalization on the extracted patches

    n_drop_components: int, default 0
        number of leading components to drop; accepted for API
        compatibility with the original implementation but unused here

    verbose: bool, default False
        whether to display verbose output

    transform_algorithm: {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}
        Algorithm used to transform the data:

        lars: uses the least angle regression method (linear_model.lars_path)

        lasso_lars: uses Lars to compute the Lasso solution

        lasso_cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). lasso_lars will be faster
        if the estimated components are sparse.

        omp: uses orthogonal matching pursuit to estimate the sparse
        solution

        threshold: squashes to zero all coefficients less than alpha from
        the projection X.T * Y

    transform_n_nonzero_coefs: int, 0.1 * n_features by default
        Number of nonzero coefficients to target in each column of the
        solution. This is only used by `algorithm='lars'` and
        `algorithm='omp'` and is overridden by `alpha` in the `omp` case.

    transform_alpha: float, 1. by default
        If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is
        the penalty applied to the L1 norm.
        If `algorithm='threshold'`, `alpha` is the absolute value of the
        threshold below which coefficients will be squashed to zero.
        If `algorithm='omp'`, `alpha` is the tolerance parameter: the
        value of the reconstruction error targeted. In this case, it
        overrides `n_nonzero_coefs`.

    split_sign: bool, default False
        whether to split the transformed feature vectors into positive
        and negative components, so that downstream classification
        algorithms can assign different weights depending on the sign

    n_jobs: int,
        number of parallel jobs to run

    Attributes
    ----------
    components_: array of shape (n_atoms, n_features)
        centers extracted by k-means from the patch space

    References
    ----------
    An Analysis of Single-Layer Networks in Unsupervised Feature Learning
    Adam Coates, Honglak Lee and Andrew Ng. In NIPS*2010 Workshop on
    Deep Learning and Unsupervised Feature Learning.
    http://robotics.stanford.edu/~ang/papers/nipsdlufl10-AnalysisSingleLayerUnsupervisedFeatureLearning.pdf
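
    Examples
    --------
    A minimal sketch on synthetic random patches (illustrative only; the
    data and shapes here are hypothetical, not from the original gist):

    >>> import numpy as np
    >>> patches = np.random.RandomState(0).randn(500, 36)
    >>> coder = KMeansCoder(n_atoms=16, whiten=True, max_iter=10)
    >>> codes = coder.fit(patches).transform(patches)
    >>> codes.shape
    (500, 16)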
"""

    def __init__(self, n_atoms, whiten=True, n_components=None,
                 max_iter=100, n_init=1, n_prefit=5, tol=1e-4,
                 local_contrast=True, n_drop_components=0, verbose=False,
                 transform_algorithm='omp', transform_n_nonzero_coefs=None,
                 transform_alpha=None, split_sign=False, n_jobs=1):
        self.n_atoms = n_atoms
        self.whiten = whiten
        self.max_iter = max_iter
        self.n_init = n_init
        self.n_components = n_components
        self.local_contrast = local_contrast
        self.n_prefit = n_prefit
        # store n_drop_components so BaseEstimator.get_params works; it
        # was accepted but never stored in the original
        self.n_drop_components = n_drop_components
        self.verbose = verbose
        self.tol = tol
        self.transform_algorithm = transform_algorithm
        self.transform_n_nonzero_coefs = transform_n_nonzero_coefs
        self.transform_alpha = transform_alpha
        self.split_sign = split_sign
        self.n_jobs = n_jobs

    def local_contrast_normalization(self, X):
        """Normalize the patch-wise variance of the signal.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Data to be normalized

        Returns
        -------
        X:
            Data after individual normalization of the samples
        """
        # XXX: this should probably be extracted somewhere more general
        # center all colour channels together
        X = X.reshape((X.shape[0], -1))
        X -= X.mean(axis=1)[:, None]

        X_std = X.std(axis=1)
        # Cap the divisor to avoid amplifying samples that are essentially
        # a flat surface into full-contrast salt-and-pepper garbage.
        # The actual value is a wild guess.
        # This trick is credited to N. Pinto.
        min_divisor = (2 * X_std.min() + X_std.mean()) / 3
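        # e.g. if the patch stds were [0.0, 0.1, 2.0] (hypothetical
        # numbers for illustration), the floor would be
        # (2 * 0.0 + 0.7) / 3 ~= 0.23, so the flat patch is divided by
        # 0.23 rather than 0.0 and stays flat instead of becoming noise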
        X /= np.maximum(min_divisor, X_std).reshape((X.shape[0], 1))
        return X

    def fit(self, X, y=None, **kwargs):
        """Fit the encoder on a collection of data, e.g. image patches.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            the patch data to be fitted

        Returns
        -------
        self: object
            Returns the object itself
        """
        X = np.atleast_2d(X)
        n_samples, n_features = X.shape

        # normalize each patch individually
        if self.local_contrast:
            if self.verbose:
                print "Local contrast normalization of the data"
            X = self.local_contrast_normalization(X)

        # kmeans model to find the filters
        if self.verbose:
            print "About to extract atoms from %d samples" % n_samples
        kmeans = KMeans(n_clusters=self.n_atoms, init='k-means++',
                        max_iter=self.max_iter, n_init=self.n_init,
                        tol=self.tol, verbose=self.verbose)

        if self.whiten:
            if self.verbose:
                print "Whitening PCA of the samples"
            self.pca = pca = PCA(whiten=True, n_components=self.n_components)
            pca.fit(X)
            X = pca.transform(X)

            # compute the KMeans centers; read the fitted dimensionality
            # from components_, since pca.n_components may be None
            n_pca_components = pca.components_.shape[0]
            if 0 < self.n_prefit < n_pca_components:
                if self.verbose:
                    print "First KMeans in simplified curriculum space"
                # start the kmeans on the projection to the first singular
                # components: curriculum learning trick by Andrej Karpathy
                kmeans.fit(X[:, :self.n_prefit])

                # warm restart by padding previous centroids with zeros,
                # with full dimensionality this time
                kmeans.init = np.zeros((self.n_atoms, n_pca_components),
                                       dtype=kmeans.cluster_centers_.dtype)
                kmeans.init[:, :self.n_prefit] = kmeans.cluster_centers_
                if self.verbose:
                    print "Second KMeans in full whitened sample space"
                kmeans.set_params(n_init=1).fit(X)
            else:
                if self.verbose:
                    print "KMeans in full original sample space"
                # regular kmeans fit (without the curriculum trick)
                kmeans.fit(X)

            # project the centers back to the original, non-whitened space
            # (useful for qualitative inspection of the filters)
            self.components_ = self.pca.inverse_transform(
                kmeans.cluster_centers_)
        else:
            # find the kernels in the raw original dimensional space
            # TODO: experiment with component-wise scaling too
            self.pca = None
            kmeans.fit(X)
            self.components_ = kmeans.cluster_centers_

        self.kmeans = kmeans
        self.inertia_ = kmeans.inertia_
        return self

    def transform(self, X, y=None):
        """Map a collection of samples into the sparse code space"""
        if self.local_contrast:
            # TODO: make it inplace by default explicitly
            X = self.local_contrast_normalization(X)
        # BaseDictionaryLearning no longer exists in sklearn 0.14; encode
        # against the learned dictionary with sparse_encode instead
        code = sparse_encode(X, self.components_,
                             algorithm=self.transform_algorithm,
                             n_nonzero_coefs=self.transform_n_nonzero_coefs,
                             alpha=self.transform_alpha, n_jobs=self.n_jobs)
        if self.split_sign:
            # stack positive and negative parts for downstream classifiers
            code = np.hstack((np.maximum(code, 0), -np.minimum(code, 0)))
        return code
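
# A minimal end-to-end usage sketch (array names are hypothetical; note
# that the demo below only visualizes the learned atoms and never calls
# transform):
#
#     coder = KMeansCoder(n_atoms=100, whiten=True).fit(train_patches)
#     train_codes = coder.transform(train_patches)  # shape (n, 100)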

if __name__ == '__main__':
    # this string is not the module docstring (the file starts with
    # comments), so bind it explicitly instead of printing __doc__
    demo_doc = """
    Dictionary learning with K-Means on faces image data
    ====================================================

    This shows 400 dictionary atoms learned from 6x6 image patches
    extracted from the face recognition dataset. The dictionary atoms are
    learned using (:ref:`KMeansCoder`), with and without a whitening PCA
    transform, respectively.

    The dataset used in this example is a preprocessed excerpt of the
    "Labeled Faces in the Wild", aka LFW_:
    http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)

    .. _LFW: http://vis-www.cs.umass.edu/lfw/

    .. |kc_no_w| image:: /images/plot_kmeans_coder_1.png
        :scale: 50%

    .. |kc_w| image:: /images/plot_kmeans_coder_2.png
        :scale: 50%

    .. centered:: |kc_no_w| |kc_w|
    """
    print demo_doc

    from time import time
    import logging
    import pylab as pl

    from sklearn.cross_validation import StratifiedKFold
    from sklearn.datasets import fetch_lfw_people
    from sklearn.feature_extraction.image import PatchExtractor

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    ###########################################################################
    # Download the data, if not already on disk, and load it as numpy arrays
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    # reshape the data using the traditional (n_samples, n_features) shape
    faces = lfw_people.data
    n_samples, h, w = lfw_people.images.shape

    X = faces
    X -= X.mean(axis=1)[:, np.newaxis]
    n_features = X.shape[1]
    # PatchExtractor expects images, so reshape back to (n_samples, h, w)
    X = X.reshape((n_samples, h, w))

    # the label to predict is the id of the person
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print "Total dataset size:"
    print "n_samples: %d" % n_samples
    print "n_features: %d" % n_features
    print "n_classes: %d" % n_classes

    ###########################################################################
    # Split into a training set and a test set using a stratified k fold
    train, test = iter(StratifiedKFold(y, n_folds=4)).next()
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

    ###########################################################################
    # Extract image patches and learn dictionaries of atoms with KMeansCoder,
    # without and with a whitening PCA transform
    n_atoms = 400

    print "Extracting image patches from %d faces" % len(X_train)
    t0 = time()
    extr = PatchExtractor(patch_size=(6, 6), max_patches=100, random_state=0)
    patches = extr.transform(X_train)
    print "done in %0.3fs" % (time() - t0)

    print "Extracting %d atoms from %d patches" % (n_atoms, len(patches))
    t0 = time()
    # flatten each 6x6 patch into a 36-dimensional feature vector
    patches = patches.reshape((patches.shape[0],
                               patches.shape[1] * patches.shape[2]))
    kc1 = KMeansCoder(n_atoms, max_iter=5, verbose=True,
                      whiten=False).fit(patches)
    print "done in %0.3fs" % (time() - t0)

    print "Extracting %d whitened atoms from %d patches" % (
        n_atoms, len(patches))
    t0 = time()
    kc2 = KMeansCoder(n_atoms, max_iter=5, verbose=True,
                      whiten=True).fit(patches)
    print "done in %0.3fs" % (time() - t0)

    ###########################################################################
    # Qualitative evaluation of the extracted filters
    n_row = int(np.sqrt(n_atoms))
    n_col = int(np.sqrt(n_atoms))

    titles = ["without whitening PCA", "with whitening PCA"]
    for img_index, components in enumerate((kc1.components_,
                                            kc2.components_)):
        pl.figure(figsize=(5, 6))
        pl.suptitle("Dictionary learned with K-Means on the \n LFW dataset " +
                    titles[img_index])
        for i, atom in enumerate(components):
            pl.subplot(n_row, n_col, i + 1)
            pl.imshow(atom.reshape((6, 6)), cmap=pl.cm.gray,
                      interpolation="nearest")
            pl.xticks(())
            pl.yticks(())
        pl.subplots_adjust(0.02, 0.03, 0.98, 0.90, 0.14, 0.01)
    pl.show()
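
    # The class docstring suggests training a linear SVM on features from
    # this estimator. A rough sketch, assuming the patch codes have been
    # pooled into one feature vector per image (pooling is not shown here;
    # `pooled_train` / `pooled_test` are hypothetical arrays):
    #
    #     from sklearn.svm import LinearSVC
    #     clf = LinearSVC(C=1.0).fit(pooled_train, y_train)
    #     print(clf.score(pooled_test, y_test))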