@ogrisel
Created December 14, 2012 19:19
Scratchpad for feature selection for clustering using a consensus ensemble method.
joblib
*.pyc
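The core idea of the scratchpad, stripped of the text-specific plumbing, is: cluster the data several times under small random per-feature rescalings and measure how much the resulting labelings agree with one another (the "consensus" score, here the mean pairwise adjusted Rand index). The following is a minimal, self-contained sketch of that scoring loop on a toy dataset; the make_blobs / KMeans setup and all parameter values are illustrative choices, not part of the original gist.

# Illustrative toy sketch of the consensus-stability score (not part of the
# original gist): cluster several randomly rescaled copies of X and average
# the pairwise adjusted Rand index between the resulting labelings.
from itertools import combinations

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score


def consensus_score(X, n_clusters, n_runs=5, scale_width=0.03, seed=0):
    """Mean pairwise ARI of clusterings of randomly rescaled copies of X.

    Higher values mean the clustering is more stable under small
    perturbations of the feature scales.
    """
    rng = np.random.RandomState(seed)
    labelings = []
    for _ in range(n_runs):
        scales = 1. + rng.uniform(-scale_width / 2, scale_width / 2,
                                  size=X.shape[1])
        km = KMeans(n_clusters=n_clusters, n_init=1,
                    random_state=rng.randint(100000))
        labelings.append(km.fit_predict(X * scales))
    return np.mean([adjusted_rand_score(a, b)
                    for a, b in combinations(labelings, 2)])


X_toy, _ = make_blobs(n_samples=300, centers=3, random_state=42)
toy_scores = [(k, consensus_score(X_toy, k)) for k in range(2, 7)]

The full scratchpad below applies the same scoring loop to vectorized 20 newsgroups documents and also reports how well each clustering agrees with the ground-truth newsgroup labels.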
from optparse import OptionParser
from time import time
import numpy as np
import pylab as pl
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals.joblib import Memory
from sklearn.utils.sparsefuncs import inplace_csr_column_scale
# Parse commandline arguments
op = OptionParser()
op.add_option("--n-random-runs", default=5,
              type=int, help="Number of random runs")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) to extract from text.")
op.add_option("--seed", type=int,
              help="Seed for the Random Number Generator.")
op.add_option("--min-n-clusters", type=int, default=2)
op.add_option("--max-n-clusters", type=int, default=10)
op.add_option("--use-random-data", action="store_true", default=False,
              help="Use random data instead of 20 newsgroups as a sanity check.")
op.add_option("--pca-reduce-to", type=int, default=-1,
              help="Reduce the dimensionality using PCA.")

(opts, args) = op.parse_args()

rng = np.random.RandomState(opts.seed)

# Cache the expensive vectorization step on disk ('c' reloads cached arrays
# as copy-on-write memory maps)
m = Memory('.', mmap_mode='c')

# Load and vectorize the data
print "Loading 20 newsgroups dataset for categories:"
categories = [
    'alt.atheism',
    # 'comp.graphics',
    # 'comp.os.ms-windows.misc',
    # 'comp.sys.ibm.pc.hardware',
    # 'comp.sys.mac.hardware',
    # 'comp.windows.x',
    # 'misc.forsale',
    # 'rec.autos',
    # 'rec.motorcycles',
    'rec.sport.baseball',
    # 'rec.sport.hockey',
    # 'sci.crypt',
    # 'sci.electronics',
    'sci.med',
    # 'sci.space',
    # 'soc.religion.christian',
    'talk.politics.guns',
    # 'talk.politics.mideast',
    # 'talk.politics.misc',
    # 'talk.religion.misc'
]
# categories = None  # uncomment to load all 20 categories
print categories

if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = Pipeline((
            ('hasher', hasher),
            ('tf_idf', TfidfTransformer())
        ))
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=False,
                                       norm='l2', binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 stop_words='english', use_idf=opts.use_idf)

@m.cache()
def load_vectorized_data(categories, vectorizer, random_state=42):
    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    dataset = fetch_20newsgroups(
        subset='all', categories=categories, shuffle=True,
        random_state=random_state)
    X = vectorizer.fit_transform(dataset.data)
    print "done in %fs" % (time() - t0)
    print
    return X, dataset

if opts.use_random_data:
    X = rng.normal(0, 1, size=(100, 100))
    labels = rng.randint(4, size=X.shape[0])
else:
    X, dataset = load_vectorized_data(categories, vectorizer)
    labels = dataset.target
    print "%d categories" % len(dataset.target_names)

if opts.pca_reduce_to > 0:
    print "Reducing dimensionality from %d to %d using PCA" % (
        X.shape[1], opts.pca_reduce_to)
    X = RandomizedPCA(opts.pca_reduce_to, whiten=True).fit_transform(X)

print "n_samples: %d, n_features: %d" % X.shape

n_categories = np.unique(labels).shape[0]
print "n_categories: %d" % n_categories

n_samples, n_features = X.shape

# Rescale the features randomly and compute the clustering stability for
# various values of n_clusters
ground_truth_scores = []
consensus_scores = []
n_clusters_range = np.arange(opts.min_n_clusters, opts.max_n_clusters)

for n_clusters in n_clusters_range:
    print "Computing randomized clusterings: n_clusters=%d" % n_clusters
    clusterings = []
    km_seed = rng.randint(100000)
    scale_width = 0.03
    gt_scores = []
    for run_idx in range(opts.n_random_runs):
        # Randomly perturb each feature scale by at most a few percent
        feature_scales = 1. + rng.uniform(
            low=-scale_width / 2, high=scale_width / 2, size=n_features)
        if hasattr(X, 'toarray'):
            X_rescaled = X.copy()
            inplace_csr_column_scale(X_rescaled, feature_scales)
        else:
            # Apply the same column-wise rescaling to dense data
            X_rescaled = X * feature_scales.reshape((1, -1))
        km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000,
                             verbose=0, random_state=None)
        km.fit(X_rescaled)
        clusterings.append(km.labels_)
        ground_truth_score = metrics.adjusted_rand_score(km.labels_, labels)
        gt_scores.append(ground_truth_score)
    ground_truth_scores.append(gt_scores)

    # Consensus score: pairwise agreement between the perturbed clusterings
    scores = []
    for i, c_i in enumerate(clusterings):
        for j, c_j in enumerate(clusterings):
            if j > i:
                score = metrics.adjusted_rand_score(c_i, c_j)
                scores.append(score)
    consensus_scores.append(scores)
    print "Consensus: %f +/-%f - Ground truth: %f +/-%f" % (
        np.mean(scores),
        np.std(scores),
        np.mean(gt_scores),
        np.std(gt_scores),
    )

pl.boxplot(consensus_scores, notch=1)
pl.xticks(np.arange(n_clusters_range.shape[0]) + 1, n_clusters_range)
pl.show()
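
A natural follow-up (not part of the original scratchpad) is to turn the stability curves into a concrete model-selection rule, for example by picking the n_clusters value with the highest mean consensus score. A minimal sketch reusing the variables computed above:

# Sketch, not in the original gist: select the most "stable" n_clusters,
# i.e. the one with the highest mean pairwise consensus score.
mean_consensus = [np.mean(s) for s in consensus_scores]
best_n_clusters = n_clusters_range[int(np.argmax(mean_consensus))]
print "best n_clusters by mean consensus score: %d" % best_n_clusters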