Created
March 27, 2014 11:21
-
-
Save kmike/9805389 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.sparse as sp | |
import hat_trie | |
from sklearn.feature_extraction.text import CountVectorizer, _make_int_array | |
class HatTrieCountVectorizer(CountVectorizer):
    """CountVectorizer that keeps its vocabulary in a ``hat_trie.Trie``.

    A HAT-trie maps the feature strings to integer indices with a much
    smaller memory footprint than the plain dict sklearn uses.  Trade-offs:

    * only vocabulary *learning* is supported — ``fixed_vocab=True`` raises
      ``NotImplementedError``;
    * features keep insertion order: ``_sort_features`` and
      ``_limit_features`` are no-ops, because re-keying a trie's stored
      indices (as sklearn does with its dict vocabulary) is not supported.
    """

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create the sparse term-count matrix and the learned vocabulary.

        Mirrors ``CountVectorizer._count_vocab`` but only implements the
        ``fixed_vocab=False`` path.

        Returns
        -------
        vocabulary : hat_trie.Trie
            Maps each feature string to its column index.
        X : scipy.sparse.csr_matrix
            Document-term count matrix, shape (n_docs, n_features).
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = hat_trie.Trie()
        analyze = self.build_analyzer()

        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                # EAFP: one trie lookup per token instead of a membership
                # test followed by a second lookup; KeyError only fires the
                # first time each distinct feature is seen.
                try:
                    j_indices.append(vocabulary[feature])
                except KeyError:
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        # Duplicate (row, col) entries (repeated tokens in a document) are
        # summed in place to produce per-document term counts.
        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X

    def _sort_features(self, X, vocabulary):
        # No-op: columns stay in insertion order.  Alphabetical sorting
        # would require rewriting every index stored in the trie.
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        # No-op: min_df/max_df/max_features pruning is not supported with
        # a trie vocabulary; report no removed terms.
        return X, set()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment