@ogrisel
Created December 14, 2012 19:19
Scratchpad for feature selection for clustering using a consensus ensemble method.
joblib
*.pyc
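The core idea of the scratchpad, stripped of the text-specific plumbing, is: cluster the data several times under small random per-feature rescalings and measure how much the resulting labelings agree with one another (the "consensus" score, here the mean pairwise adjusted Rand index). The following is a minimal, self-contained sketch of that scoring loop on a toy dataset; the make_blobs / KMeans setup and all parameter values are illustrative choices, not part of the original gist.

# Illustrative toy sketch of the consensus-stability score (not part of the
# original gist): cluster several randomly rescaled copies of X and average
# the pairwise adjusted Rand index between the resulting labelings.
from itertools import combinations

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score


def consensus_score(X, n_clusters, n_runs=5, scale_width=0.03, seed=0):
    """Mean pairwise ARI of clusterings of randomly rescaled copies of X.

    Higher values mean the clustering is more stable under small
    perturbations of the feature scales.
    """
    rng = np.random.RandomState(seed)
    labelings = []
    for _ in range(n_runs):
        scales = 1. + rng.uniform(-scale_width / 2, scale_width / 2,
                                  size=X.shape[1])
        km = KMeans(n_clusters=n_clusters, n_init=1,
                    random_state=rng.randint(100000))
        labelings.append(km.fit_predict(X * scales))
    return np.mean([adjusted_rand_score(a, b)
                    for a, b in combinations(labelings, 2)])


X_toy, _ = make_blobs(n_samples=300, centers=3, random_state=42)
toy_scores = [(k, consensus_score(X_toy, k)) for k in range(2, 7)]

The full scratchpad below applies the same scoring loop to vectorized 20 newsgroups documents and also reports how well each clustering agrees with the ground-truth newsgroup labels.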
from optparse import OptionParser
from time import time
import numpy as np
import pylab as pl
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals.joblib import Memory
from sklearn.utils.sparsefuncs import inplace_csr_column_scale
# Parse commandline arguments
op = OptionParser()
op.add_option("--n-random-runs", default=5,
              type=int, help="Number of random runs")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) to extract from text.")
op.add_option("--seed", type=int,
              help="Seed for the Random Number Generator.")
op.add_option("--min-n-clusters", type=int, default=2)
op.add_option("--max-n-clusters", type=int, default=10)
op.add_option("--use-random-data", action="store_true", default=False,
              help="Use random data instead of 20 newsgroups as a sanity check.")
op.add_option("--pca-reduce-to", type=int, default=-1,
              help="Reduce the dimensionality using PCA.")

(opts, args) = op.parse_args()

rng = np.random.RandomState(opts.seed)

# Cache the expensive vectorization step on disk ('c' reloads cached arrays
# as copy-on-write memory maps)
m = Memory('.', mmap_mode='c')

# Load and vectorize the data
print "Loading 20 newsgroups dataset for categories:"
categories = [
    'alt.atheism',
    # 'comp.graphics',
    # 'comp.os.ms-windows.misc',
    # 'comp.sys.ibm.pc.hardware',
    # 'comp.sys.mac.hardware',
    # 'comp.windows.x',
    # 'misc.forsale',
    # 'rec.autos',
    # 'rec.motorcycles',
    'rec.sport.baseball',
    # 'rec.sport.hockey',
    # 'sci.crypt',
    # 'sci.electronics',
    'sci.med',
    # 'sci.space',
    # 'soc.religion.christian',
    'talk.politics.guns',
    # 'talk.politics.mideast',
    # 'talk.politics.misc',
    # 'talk.religion.misc'
]
# categories = None  # uncomment to load all 20 categories
print categories

if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = Pipeline((
            ('hasher', hasher),
            ('tf_idf', TfidfTransformer())
        ))
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=False,
                                       norm='l2', binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 stop_words='english', use_idf=opts.use_idf)

@m.cache()
def load_vectorized_data(categories, vectorizer, random_state=42):
    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    dataset = fetch_20newsgroups(
        subset='all', categories=categories, shuffle=True,
        random_state=random_state)
    X = vectorizer.fit_transform(dataset.data)
    print "done in %fs" % (time() - t0)
    print
    return X, dataset

if opts.use_random_data:
    X = rng.normal(0, 1, size=(100, 100))
    labels = rng.randint(4, size=X.shape[0])
else:
    X, dataset = load_vectorized_data(categories, vectorizer)
    labels = dataset.target
    print "%d categories" % len(dataset.target_names)

if opts.pca_reduce_to > 0:
    print "Reducing dimensionality from %d to %d using PCA" % (
        X.shape[1], opts.pca_reduce_to)
    X = RandomizedPCA(opts.pca_reduce_to, whiten=True).fit_transform(X)

print "n_samples: %d, n_features: %d" % X.shape

n_categories = np.unique(labels).shape[0]
print "n_categories: %d" % n_categories

n_samples, n_features = X.shape

# Rescale the features randomly and compute the clustering stability for
# various values of n_clusters
ground_truth_scores = []
consensus_scores = []
n_clusters_range = np.arange(opts.min_n_clusters, opts.max_n_clusters)

for n_clusters in n_clusters_range:
    print "Computing randomized clusterings: n_clusters=%d" % n_clusters
    clusterings = []
    km_seed = rng.randint(100000)
    scale_width = 0.03
    gt_scores = []
    for run_idx in range(opts.n_random_runs):
        # Randomly perturb each feature scale by at most a few percent
        feature_scales = 1. + rng.uniform(
            low=-scale_width / 2, high=scale_width / 2, size=n_features)
        if hasattr(X, 'toarray'):
            X_rescaled = X.copy()
            inplace_csr_column_scale(X_rescaled, feature_scales)
        else:
            # Apply the same column-wise rescaling to dense data
            X_rescaled = X * feature_scales.reshape((1, -1))
        km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                             n_init=1, init_size=1000, batch_size=1000,
                             verbose=0, random_state=None)
        km.fit(X_rescaled)
        clusterings.append(km.labels_)
        ground_truth_score = metrics.adjusted_rand_score(km.labels_, labels)
        gt_scores.append(ground_truth_score)
    ground_truth_scores.append(gt_scores)

    # Consensus score: pairwise agreement between the perturbed clusterings
    scores = []
    for i, c_i in enumerate(clusterings):
        for j, c_j in enumerate(clusterings):
            if j > i:
                score = metrics.adjusted_rand_score(c_i, c_j)
                scores.append(score)
    consensus_scores.append(scores)
    print "Consensus: %f +/-%f - Ground truth: %f +/-%f" % (
        np.mean(scores),
        np.std(scores),
        np.mean(gt_scores),
        np.std(gt_scores),
    )

pl.boxplot(consensus_scores, notch=1)
pl.xticks(np.arange(n_clusters_range.shape[0]) + 1, n_clusters_range)
pl.show()
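
A natural follow-up (not part of the original scratchpad) is to turn the stability curves into a concrete model-selection rule, for example by picking the n_clusters value with the highest mean consensus score. A minimal sketch reusing the variables computed above:

# Sketch, not in the original gist: select the most "stable" n_clusters,
# i.e. the one with the highest mean pairwise consensus score.
mean_consensus = [np.mean(s) for s in consensus_scores]
best_n_clusters = n_clusters_range[int(np.argmax(mean_consensus))]
print "best n_clusters by mean consensus score: %d" % best_n_clusters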