Scratchpad for feature selection for clustering using a consensus ensemble method.
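The script below perturbs the vectorized features with small random per-feature rescalings, re-runs MiniBatchKMeans for a range of n_clusters values, and scores each n_clusters by the pairwise Adjusted Rand Index between the perturbed runs (the "consensus" score), printing the ARI against the true newsgroup labels alongside for reference.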
.gitignore:

joblib
*.pyc
from optparse import OptionParser
from time import time

import numpy as np
import pylab as pl

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals.joblib import Memory
from sklearn.utils.sparsefuncs import inplace_csr_column_scale
# Parse command-line arguments
op = OptionParser()
op.add_option("--n-random-runs", default=5,
              type=int, help="Number of random runs")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) to extract from text.")
op.add_option("--seed", type=int,
              help="Seed for the Random Number Generator.")
op.add_option("--min-n-clusters", type=int, default=2)
op.add_option("--max-n-clusters", type=int, default=10)
op.add_option("--use-random-data", action="store_true", default=False,
              help="Use random data instead of 20 newsgroups as a sanity check.")
op.add_option("--pca-reduce-to", type=int, default=-1,
              help="Reduce the dimensionality using PCA.")

(opts, args) = op.parse_args()
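# Example invocation (the script can be saved under any name; the file name
# below is just a placeholder):
#   python consensus_clustering.py --seed 42 --min-n-clusters 2 --max-n-clusters 12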
rng = np.random.RandomState(opts.seed)
m = Memory('.', mmap_mode='c')
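# The Memory cache is written under ./joblib/, hence the "joblib" entry in
# the .gitignore above.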
# Load and vectorize the data
print "Loading 20 newsgroups dataset for categories:"
categories = [
    'alt.atheism',
    # 'comp.graphics',
    # 'comp.os.ms-windows.misc',
    # 'comp.sys.ibm.pc.hardware',
    # 'comp.sys.mac.hardware',
    # 'comp.windows.x',
    # 'misc.forsale',
    # 'rec.autos',
    # 'rec.motorcycles',
    'rec.sport.baseball',
    # 'rec.sport.hockey',
    # 'sci.crypt',
    # 'sci.electronics',
    'sci.med',
    # 'sci.space',
    # 'soc.religion.christian',
    'talk.politics.guns',
    # 'talk.politics.mideast',
    # 'talk.politics.misc',
    # 'talk.religion.misc'
]
# categories = None  # uncomment to load all 20 categories
print categories
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = Pipeline((
            ('hasher', hasher),
            ('tf_idf', TfidfTransformer())
        ))
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=False,
                                       norm='l2', binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 stop_words='english', use_idf=opts.use_idf)
@m.cache()
def load_vectorized_data(categories, vectorizer, random_state=42):
    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    dataset = fetch_20newsgroups(
        subset='all', categories=categories, shuffle=True,
        random_state=random_state)
    X = vectorizer.fit_transform(dataset.data)
    print "done in %fs" % (time() - t0)
    return X, dataset
if opts.use_random_data:
    X = rng.normal(0, 1, size=(100, 100))
    labels = rng.randint(4, size=X.shape[0])
else:
    X, dataset = load_vectorized_data(categories, vectorizer)
    labels = dataset.target
    print "%d categories" % len(dataset.target_names)

if opts.pca_reduce_to > 0:
    print "Reducing dimensionality from %d to %d using PCA" % (
        X.shape[1], opts.pca_reduce_to)
    X = RandomizedPCA(opts.pca_reduce_to, whiten=True).fit_transform(X)

print "n_samples: %d, n_features: %d" % X.shape
n_categories = np.unique(labels).shape[0]
print "n_categories: %d" % n_categories
n_samples, n_features = X.shape
# Let's rescale features randomly and compute the clustering stability for
# various values of n_clusters: stable clusterings should agree with one
# another despite the small random perturbations of the feature scales.
ground_truth_scores = []
consensus_scores = []
# Note: np.arange excludes the upper bound, so --max-n-clusters itself is
# not tried.
n_clusters_range = np.arange(opts.min_n_clusters, opts.max_n_clusters)
for n_clusters in n_clusters_range:
    print "Computing randomized clusterings: n_clusters=%d" % n_clusters
    clusterings = []
    km_seed = rng.randint(100000)  # drawn but unused: random_state=None
                                   # below lets each run vary
    scale_width = 0.03
    gt_scores = []
    for run_idx in range(opts.n_random_runs):
        # Perturb each feature by a small random rescaling around 1.0
        feature_scales = 1. + rng.uniform(
            low=-scale_width / 2, high=scale_width / 2, size=n_features)
        if hasattr(X, 'toarray'):
            # sparse CSR input: scale the columns in place on a copy
            X_rescaled = X.copy()
            inplace_csr_column_scale(X_rescaled, feature_scales)
        else:
            X_rescaled = X / feature_scales.reshape((1, -1))
        km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000,
                             verbose=0, random_state=None)
        km.fit(X_rescaled)
        clusterings.append(km.labels_)
        ground_truth_score = metrics.adjusted_rand_score(km.labels_, labels)
        gt_scores.append(ground_truth_score)
    ground_truth_scores.append(gt_scores)

    # Consensus score: Adjusted Rand Index for each pair of distinct runs
    scores = []
    for i, c_i in enumerate(clusterings):
        for j, c_j in enumerate(clusterings):
            if j > i:
                score = metrics.adjusted_rand_score(c_i, c_j)
                scores.append(score)
    consensus_scores.append(scores)
    print "Consensus: %f +/-%f - Ground truth: %f +/-%f" % (
        np.mean(scores),
        np.std(scores),
        np.mean(gt_scores),
        np.std(gt_scores),
    )
pl.boxplot(consensus_scores, notch=1)
pl.xticks(np.arange(n_clusters_range.shape[0]) + 1, n_clusters_range)
pl.show()
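The ground_truth_scores collected in the loop are printed but never plotted. A minimal companion plot, as an optional addition to the original script, could reuse the same boxplot layout:

# Optional addition (not part of the original gist): boxplot of the ARI
# against the true newsgroup labels, per n_clusters value.
pl.figure()
pl.boxplot(ground_truth_scores, notch=1)
pl.xticks(np.arange(n_clusters_range.shape[0]) + 1, n_clusters_range)
pl.title("Adjusted Rand Index against the true labels")
pl.show()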