@balamuru
Created February 7, 2013 22:19
Incremental Mini KMeans Clustering
# Author: Peter Prettenhofer <[email protected]>
# Lars Buitinck <[email protected]>
# License: Simplified BSD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.cluster import MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
import os

import numpy as np
files_per_chunk = 10
def iter_documents(top_directory, max_files_per_chunk):
    """Walk top_directory and yield {filename: document text} dicts,
    each holding at most max_files_per_chunk documents."""
    docs = {}
    for root, dirs, files in os.walk(top_directory):
        for fname in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            docs[fname] = open(os.path.join(root, fname)).read()
            if len(docs) >= max_files_per_chunk:
                yield docs
                docs = {}
    if docs:
        # yield the final, possibly smaller, chunk
        yield docs
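# Illustration (hypothetical file names, not part of the corpus): given
# top_directory containing a.txt, b.txt and c.txt with max_files_per_chunk=2,
# the generator yields {'a.txt': ..., 'b.txt': ...} first, then {'c.txt': ...}.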
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)
#############################
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               non_negative=False, norm='l2',
                               binary=False)

num_clusters = 5
km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, verbose=1)
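# HashingVectorizer is stateless (no fitted vocabulary), so each chunk can be
# transformed independently and fed to MiniBatchKMeans.partial_fit, which
# updates the centroids incrementally without holding the whole corpus in memory.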
file_names = []
labels = []
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    # add the docs in chunks of size 'files_per_chunk'
    file_names.extend(doc_dict.keys())
    X_transform_counts = vectorizer.transform(doc_dict.values())
    # fit this chunk of data, then record the chunk's cluster labels
    # (assigned with the centroids seen so far)
    km.partial_fit(X_transform_counts)
    labels.extend(km.labels_)
    print "## counts: " + str(X_transform_counts.shape)

print vectorizer
labels = np.array(labels)
file_names = np.array(file_names)
for cluster_id in range(km.n_clusters):
    indices = np.where(labels == cluster_id)[0]
    if len(indices) > 0:
        print "Indices " + str(indices)
        # map this cluster's documents back to their file names
        # (for assignments under the final centroids, see the predict() sketch below)
        cluster_doc_filenames = file_names[indices]
        for cluster_doc_filename in cluster_doc_filenames:
            print str(cluster_id) + " : " + cluster_doc_filename
    else:
        print "empty indices"