Incremental Mini KMeans Clustering
Clusters a directory of .txt files in chunks: each chunk is hashed into a fixed-dimensional feature space with HashingVectorizer and fed to MiniBatchKMeans.partial_fit, then every document is assigned to a cluster and listed by file name.
# Author: Peter Prettenhofer <[email protected]>
#         Lars Buitinck <[email protected]>
# License: Simplified BSD
"""Cluster a directory of text files incrementally: vectorize the documents
in fixed-size chunks with a stateless HashingVectorizer and feed each chunk
to MiniBatchKMeans.partial_fit."""

import logging
import os
from optparse import OptionParser

import numpy as np
from scipy import sparse

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import HashingVectorizer
files_per_chunk = 10

def iter_documents(top_directory, max_files_per_chunk):
    """Walk top_directory, yielding dicts that map file name -> document
    text, with at most max_files_per_chunk documents per dict."""
    docs = {}
    for root, dirs, files in os.walk(top_directory):
        for file_name in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            docs[file_name] = open(os.path.join(root, file_name)).read()
            if len(docs) >= max_files_per_chunk:
                yield docs
                docs = {}
    if docs:
        # yield the final, possibly smaller, chunk
        yield docs
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer.")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print(__doc__)
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")  # error() exits the script

#############################
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               norm='l2', binary=False)

num_clusters = 5
km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                     init_size=1000,
                     batch_size=1000, verbose=1)
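# Note: HashingVectorizer is stateless, so transform() works on each chunk
# without fitting a vocabulary first (fit_transform() would be redundant).
# MiniBatchKMeans.partial_fit() updates the centroids one chunk at a time;
# batch_size is used by fit(), not partial_fit(), and after the loop
# km.labels_ covers only the *last* chunk, so the final labels for the whole
# corpus come from km.predict() below.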
file_names = []
chunks = []
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    # add the docs in chunks of size 'files_per_chunk', keeping the feature
    # rows in the same order as the file names
    names = list(doc_dict.keys())
    file_names.extend(names)
    X_transform_counts = vectorizer.transform([doc_dict[name] for name in names])
    chunks.append(X_transform_counts)
    # fit this chunk of data
    km.partial_fit(X_transform_counts)
    print("## counts: " + str(X_transform_counts.shape))

print(vectorizer)
# km.labels_ reflects only the last chunk seen by partial_fit(), so predict
# over the full corpus instead; a plain Python list cannot be indexed with a
# numpy index array, so file_names is converted to an array first.
labels = km.predict(sparse.vstack(chunks))
file_names = np.array(file_names)

for cluster_id in range(km.n_clusters):
    indices = np.where(labels == cluster_id)[0]
    if len(indices) > 0:
        print("Indices " + str(indices))
        for cluster_doc_filename in file_names[indices]:
            print(str(cluster_id) + " : " + cluster_doc_filename)
    else:
        print("empty indices")