@balamuru
Last active December 12, 2015 05:59
scikit incremental vectorizer - debug
# Author: Peter Prettenhofer <[email protected]>
# Lars Buitinck <[email protected]>
# License: Simplified BSD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
import os
import urllib
import tarfile
import pickle
import shutil
from os import environ
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
from os import listdir
from os import makedirs
files_per_chunk = 10
def iter_documents(top_directory, max_files_per_chunk):
    """Iterate over all .txt documents under top_directory, yielding them in
    chunks of at most max_files_per_chunk as {filename: document text} dicts."""
    doc_chunk = {}
    for root, dirs, files in os.walk(top_directory):
        for filename in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            document = open(os.path.join(root, filename)).read()
            doc_chunk[filename] = document
            if len(doc_chunk) >= max_files_per_chunk:
                yield doc_chunk
                doc_chunk = {}
    # yield whatever is left over as a final, possibly smaller chunk
    if doc_chunk:
        yield doc_chunk
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer.")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)
# Uncomment the following to do the analysis on all the categories
#categories = None
#############################
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               non_negative=False, norm='l2',
                               binary=False)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    # transform the docs in chunks of size 'files_per_chunk'
    X_transform_counts = vectorizer.transform(doc_dict.values())
    # X_fit_transform_counts = vectorizer.fit_transform(doc_dict.values())
    # Question 1: I don't know the class information because this is an unsupervised
    #             learning (clustering) operation, so I can't perform a partial_fit.
    # Question 2: Given Question 1, what should I be passing into the clustering
    #             algorithm? I would first have to incrementally accumulate data
    #             in the vectorizer.
    print "## counts: " + str(X_transform_counts.shape)  # <== I won't know the document class in advance for a clustering operation

print vectorizer
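
# ---------------------------------------------------------------------------
# A possible answer to Questions 1 and 2 above (a minimal sketch, not part of
# the original gist): HashingVectorizer is stateless, so transform() can be
# called chunk by chunk without any fit/partial_fit, and
# MiniBatchKMeans.partial_fit only needs the feature matrix X (no class
# labels). Each transformed chunk can therefore be fed straight to the
# clusterer. The cluster count below and the re-use of the same directory
# path are assumptions made for illustration.
km = MiniBatchKMeans(n_clusters=10, init='k-means++', batch_size=files_per_chunk)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    X_chunk = vectorizer.transform(doc_dict.values())  # hash this chunk into the fixed feature space
    km.partial_fit(X_chunk)                            # update cluster centers incrementally, no labels needed
print "## cluster centers: " + str(km.cluster_centers_.shape)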