@balamuru
Last active December 12, 2015 05:59
scikit incremental vectorizer - debug
# Author: Peter Prettenhofer <[email protected]>
# Lars Buitinck <[email protected]>
# License: Simplified BSD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
import os
import urllib
import tarfile
import pickle
import shutil
from os import environ
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
from os import listdir
from os import makedirs
files_per_chunk = 10
def iter_documents(top_directory, max_files_per_chunk):
    """Iterate over all .txt documents under top_directory, yielding them in
    chunks of at most max_files_per_chunk as {filename: document text} dicts."""
    doc_chunk = {}
    for root, dirs, files in os.walk(top_directory):
        for filename in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            document = open(os.path.join(root, filename)).read()
            doc_chunk[filename] = document
            if len(doc_chunk) >= max_files_per_chunk:
                yield doc_chunk
                doc_chunk = {}
    # yield whatever is left over as a final, possibly smaller chunk
    if doc_chunk:
        yield doc_chunk
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer.")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)
# Uncomment the following to do the analysis on all the categories
#categories = None
#############################
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               non_negative=False, norm='l2',
                               binary=False)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    # transform the docs in chunks of size 'files_per_chunk'
    X_transform_counts = vectorizer.transform(doc_dict.values())
    # X_fit_transform_counts = vectorizer.fit_transform(doc_dict.values())
    # Question 1: I don't know the class information because this is an unsupervised
    #             learning (clustering) operation, so I can't perform a partial_fit.
    # Question 2: Given Question 1, what should I be passing into the clustering
    #             algorithm? I would first have to incrementally accumulate data
    #             in the vectorizer.
    print "## counts: " + str(X_transform_counts.shape)  # <== I won't know the document class in advance for a clustering operation

print vectorizer
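
# ---------------------------------------------------------------------------
# A possible answer to Questions 1 and 2 above (a minimal sketch, not part of
# the original gist): HashingVectorizer is stateless, so transform() can be
# called chunk by chunk without any fit/partial_fit, and
# MiniBatchKMeans.partial_fit only needs the feature matrix X (no class
# labels). Each transformed chunk can therefore be fed straight to the
# clusterer. The cluster count below and the re-use of the same directory
# path are assumptions made for illustration.
km = MiniBatchKMeans(n_clusters=10, init='k-means++', batch_size=files_per_chunk)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    X_chunk = vectorizer.transform(doc_dict.values())  # hash this chunk into the fixed feature space
    km.partial_fit(X_chunk)                            # update cluster centers incrementally, no labels needed
print "## cluster centers: " + str(km.cluster_centers_.shape)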