scikit incremental vectorizer - debug
# Author: Peter Prettenhofer <[email protected]>
#         Lars Buitinck <[email protected]>
# License: Simplified BSD

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
import os
import urllib
import tarfile
import pickle
import shutil
from os import environ
from os import listdir
from os import makedirs
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
files_per_chunk = 10

def iter_documents(top_directory, max_files_per_chunk):
    """Iterate over all .txt documents under top_directory, yielding them
    in chunks: a dict of {filename: document text} holding at most
    max_files_per_chunk entries at a time."""
    docs = {}  # named 'docs' to avoid shadowing the builtin 'dict'
    for root, dirs, files in os.walk(top_directory):
        for fname in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            document = open(os.path.join(root, fname)).read()
            docs[fname] = document
            if len(docs) >= max_files_per_chunk:
                yield docs
                docs = {}
    if docs:  # final, possibly smaller, chunk
        yield docs
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               non_negative=False, norm='l2',
                               binary=False)
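# Note (added): HashingVectorizer is stateless -- it keeps no vocabulary and
# needs no fit() step, so transform() can be called on each chunk
# independently and the feature space stays consistent across chunks.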
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315",
                               files_per_chunk):
    # add the docs in chunks of size 'files_per_chunk'
    X_transform_counts = vectorizer.transform(doc_dict.values())
    #X_fit_transform_counts = vectorizer.fit_transform(doc_dict.values())
    # Question 1: I don't know class information, because this is an
    # unsupervised learning (clustering) operation, so I can't perform a
    # partial_fit.
    # Question 2: w.r.t. Question 1, what should I pass into the clustering
    # algorithm? I would first have to incrementally accumulate data in the
    # vectorizer (see the sketch at the end of this file).
    print "## counts: " + str(X_transform_counts.shape)  # <== I won't know the document class in advance for a clustering operation
    print vectorizer
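
# --- Sketch (not part of the original gist): one possible answer to the
# questions above, assuming the goal is incremental, label-free clustering.
# Because HashingVectorizer needs no fitting, each chunk can be transformed
# independently and fed to MiniBatchKMeans.partial_fit(), which consumes the
# data one chunk at a time -- no class information is ever required.
# n_clusters=10 is an arbitrary placeholder, not a value from the original.
km = MiniBatchKMeans(n_clusters=10, init='k-means++', n_init=1)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315",
                               files_per_chunk):
    X_chunk = vectorizer.transform(doc_dict.values())
    km.partial_fit(X_chunk)  # update the cluster centers with this chunk
print "## cluster centers: " + str(km.cluster_centers_.shape)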