Skip to content

Instantly share code, notes, and snippets.

@MatthieuBizien
Created March 26, 2017 15:52
Show Gist options
  • Save MatthieuBizien/802b2ecac6beecaa1e14a7bd44f91c06 to your computer and use it in GitHub Desktop.
Save MatthieuBizien/802b2ecac6beecaa1e14a7bd44f91c06 to your computer and use it in GitHub Desktop.
class HashingTfIdfVectorizer:
    """Hash documents to sparse counts, drop rare columns, then apply TF-IDF.

    Difference with a default ``HashingVectorizer``: non-negative output,
    ``norm=None`` and ``dtype=np.float32``. Hashed columns that are non-zero
    in fewer than ``min_df`` training documents are removed before the TF-IDF
    weighting is fitted, and the same column selection is reused at
    transform time.

    Parameters
    ----------
    ngram_range, analyzer, n_features
        Forwarded unchanged to ``HashingVectorizer``.
    min_df : int
        Minimum number of training documents in which a hashed column must
        be non-zero for that column to be kept.
    sublinear_tf : bool
        Forwarded unchanged to ``TfidfTransformer``.

    Attributes
    ----------
    mask : boolean array, one entry per hashed column
        Set by ``fit`` / ``fit_transform``; True for the columns that are kept.
    """

    def __init__(self, ngram_range=(1, 1), analyzer=u'word', n_features=1 << 21, min_df=1, sublinear_tf=False):
        self.min_df = min_df
        hasher_options = dict(norm=None, dtype=np.float32, ngram_range=ngram_range,
                              analyzer=analyzer, n_features=n_features)
        try:
            # ``non_negative`` was deprecated in scikit-learn 0.19 and removed
            # in 0.21; ``alternate_sign=False`` is its documented replacement.
            self.hasher = HashingVectorizer(alternate_sign=False, **hasher_options)
        except TypeError:
            # scikit-learn < 0.19 does not know ``alternate_sign``.
            self.hasher = HashingVectorizer(non_negative=True, **hasher_options)
        self.tfidf = TfidfTransformer(sublinear_tf=sublinear_tf)

    def fit_transform(self, X, y=None):
        """Learn the column mask and the IDF weights from X, then transform X.

        ``y`` is accepted for API compatibility and ignored.
        Returns whatever ``self.tfidf.fit_transform`` returns for the
        hashed-and-masked matrix.
        """
        X_hashed = self.hasher.fit_transform(X)
        # Document frequency per hashed column = number of rows where it is non-zero.
        doc_freq = np.asarray((X_hashed != 0).sum(axis=0)).ravel()
        self.mask = doc_freq >= self.min_df
        return self.tfidf.fit_transform(X_hashed[:, self.mask])

    def fit(self, X, y=None):
        """Fit the column mask and the TF-IDF weights on X; return ``self``."""
        self.fit_transform(X, y)
        return self

    def transform(self, X):
        """Hash X, keep the columns selected during fitting, apply TF-IDF.

        Raises
        ------
        AttributeError
            If called before ``fit`` / ``fit_transform`` (no mask learned yet).
        """
        if not hasattr(self, "mask"):
            # Same exception type as the implicit failure on ``self.mask``,
            # but raised before hashing and with an actionable message.
            raise AttributeError(
                "%s is not fitted yet; call fit() or fit_transform() first"
                % type(self).__name__
            )
        X_hashed = self.hasher.transform(X)
        return self.tfidf.transform(X_hashed[:, self.mask])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment