Folder structure should be the following:

    vectorizers/
        vec/
        stored/
        string_dict/
            string_dict.pyx
            setup.py
        marisa_vectorizers.py
        memusage_fit.py
        memusage_transform.py

Run setup.py install from the string_dict folder,
then run memusage_fit.py with one of the options,
then run memusage_transform.py with the same option.
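
For example, with the marisa_count option (run memusage_fit.py without
arguments to see the full list of options):

    cd string_dict && python setup.py install && cd ..
    python memusage_fit.py marisa_count
    python memusage_transform.py marisa_count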

marisa_vectorizers.py:

import string

import numpy as np
import scipy.sparse as sp
import marisa_trie
import hat_trie
# import datrie
# import chartrie
from string_dict import UnicodeIntDict, UnicodeIntDict2
from sklearn.externals import six
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, _make_int_array


# hack to store the vocabulary in a MARISA Trie
class _MarisaVocabularyMixin(object):

    def fit_transform(self, raw_documents, y=None):
        super(_MarisaVocabularyMixin, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(_MarisaVocabularyMixin, self).fit_transform(raw_documents, y)

    def _freeze_vocabulary(self):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(self.vocabulary_.keys())
            self.fixed_vocabulary = True
            del self.stop_words_


class MarisaCountVectorizerOld(_MarisaVocabularyMixin, CountVectorizer):
    pass


class ReducedCountVectorizer(CountVectorizer):

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):
        return X, set()


class _TrieCountVectorizer(ReducedCountVectorizer):
    trie_cls = None

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = self.trie_cls()
        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
                else:
                    try:
                        j_indices.append(vocabulary[feature])
                    except KeyError:
                        # Ignore out-of-vocabulary items for fixed_vocab=True
                        # (kept from the CountVectorizer code this is based on;
                        # unreachable here because fixed_vocab raises above).
                        continue
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.int32)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):
        return X, set()


class HatTrieCountVectorizer(_TrieCountVectorizer):
    trie_cls = hat_trie.Trie


# class DatrieCountVectorizer(_TrieCountVectorizer):  # it segfaults
#     trie_cls = lambda *args: datrie.Trie(ranges=[(chr(1), chr(255))])

# class ChartrieCountVectorizer(_TrieCountVectorizer):  # can't get it to work
#     trie_cls = chartrie.CharTrie


class StdCountVectorizer(_TrieCountVectorizer):  # this is not a trie, I know
    trie_cls = UnicodeIntDict2


class MarisaCountVectorizer(CountVectorizer):
    # ``CountVectorizer.fit`` calls ``fit_transform``, so a separate
    # ``fit`` is not provided.

    def fit_transform(self, raw_documents, y=None):
        X = super(MarisaCountVectorizer, self).fit_transform(raw_documents)
        X = self._freeze_vocabulary(X)
        return X

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary = True
            del self.stop_words_
        return X

    def _reorder_features(self, X, old_vocabulary, new_vocabulary):
        # Columns of X follow the old (dict) term indices; permute them so
        # they match the ids assigned by the MARISA trie.
        map_index = np.empty(len(old_vocabulary), dtype=np.int32)
        for term, new_val in six.iteritems(new_vocabulary):
            map_index[new_val] = old_vocabulary[term]
        return X[:, map_index]


class MarisaTfidfVectorizer(TfidfVectorizer):

    def fit_transform(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit_transform(raw_documents, y)

    def fit(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit(raw_documents, y)

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            self.fixed_vocabulary = True
            del self.stop_words_
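
An aside (not part of marisa_vectorizers.py): a minimal usage sketch for
MarisaCountVectorizer, assuming the 2014-era scikit-learn and marisa-trie
versions this gist targets. After fit_transform, the vocabulary is frozen
into a marisa_trie.Trie:

    from marisa_vectorizers import MarisaCountVectorizer

    docs = [u"foo bar baz", u"foo foo bar"]
    vec = MarisaCountVectorizer()
    X = vec.fit_transform(docs)    # fit, then freeze the vocabulary into a trie

    print(type(vec.vocabulary_))   # marisa_trie.Trie instead of a dict
    print(X.shape)                 # (2, number of distinct tokens)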

memusage_fit.py:

#!/usr/bin/env python
from __future__ import division, print_function
import os
import sys
import time
import resource
import psutil
import gc

from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from marisa_vectorizers import (
    MarisaCountVectorizer,
    MarisaTfidfVectorizer,
    MarisaCountVectorizerOld,
    HatTrieCountVectorizer,
    # DatrieCountVectorizer,
    # ChartrieCountVectorizer,
    StdCountVectorizer,
    ReducedCountVectorizer,
)

vectorizers = dict(
    count = CountVectorizer(),
    count2 = CountVectorizer(ngram_range=(1,2)),
    count3 = CountVectorizer(ngram_range=(1,3)),
    count4 = CountVectorizer(ngram_range=(1,4)),
    rcount = ReducedCountVectorizer(),
    rcount2 = ReducedCountVectorizer(ngram_range=(1,2)),
    rcount3 = ReducedCountVectorizer(ngram_range=(1,3)),
    rcount4 = ReducedCountVectorizer(ngram_range=(1,4)),
    tfidf = TfidfVectorizer(),
    tfidf2 = TfidfVectorizer(ngram_range=(1,2)),
    hashing18 = HashingVectorizer(n_features=2**18),
    hashing20 = HashingVectorizer(n_features=2**20),
    marisa_count = MarisaCountVectorizer(),
    marisa_count2 = MarisaCountVectorizer(ngram_range=(1,2)),
    marisa_count3 = MarisaCountVectorizer(ngram_range=(1,3)),
    marisa_count4 = MarisaCountVectorizer(ngram_range=(1,4)),
    marisa_count_old = MarisaCountVectorizerOld(),
    marisa_count2_old = MarisaCountVectorizerOld(ngram_range=(1,2)),
    marisa_tfidf = MarisaTfidfVectorizer(),
    marisa_tfidf2 = MarisaTfidfVectorizer(ngram_range=(1,2)),
    hattrie_count = HatTrieCountVectorizer(),
    hattrie_count2 = HatTrieCountVectorizer(ngram_range=(1,2)),
    hattrie_count3 = HatTrieCountVectorizer(ngram_range=(1,3)),
    hattrie_count4 = HatTrieCountVectorizer(ngram_range=(1,4)),
    # datrie_count = DatrieCountVectorizer(),
    # datrie_count2 = DatrieCountVectorizer(ngram_range=(1,2)),
    # chartrie_count = ChartrieCountVectorizer(),
    # chartrie_count2 = ChartrieCountVectorizer(ngram_range=(1,2)),
    std_count = StdCountVectorizer(),
    std_count2 = StdCountVectorizer(ngram_range=(1,2)),
    std_count3 = StdCountVectorizer(ngram_range=(1,3)),
    std_count4 = StdCountVectorizer(ngram_range=(1,4)),
)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Available vectorizers:\n")
        print("\n".join(sorted(vectorizers.keys())))
        sys.exit()

    vecname = sys.argv[1]
    vec = vectorizers[vecname]

    newsgroups_train = datasets.fetch_20newsgroups(subset='train')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20
    # Note: ru_maxrss is reported in bytes on OS X but in kilobytes on
    # Linux; the /2**20 conversion to MB below assumes bytes.
    max_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20

    start = time.time()
    vec.fit(newsgroups_train.data)
    end = time.time()

    after = p.get_memory_info().rss / 2**20
    max_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after >= max_before

    print("fit time: %0.1fs" % (end - start))
    print("fit memory usage: %0.1fMB" % (max_after - before))

    before2 = p.get_memory_info().rss / 2**20
    joblib.dump(vec, os.path.join("vec", vecname + ".joblib"))
    max_after2 = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after2 >= max_after
    print("dump time: %0.1fs" % (time.time() - end))
    print("dump memory usage: %0.1fMB" % (max_after2 - before2))

memusage_transform.py:

#!/usr/bin/env python
from __future__ import division, print_function
import os
import sys
import time
import psutil

from sklearn.externals import joblib
from sklearn import datasets

# The following imports are not used directly, but if we don't import them
# here, the memory usage numbers will include the memory required to load
# these modules (loading the pickled vectorizer imports them anyway).
import array
import cPickle
from collections import defaultdict
import numpy
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer


def _transform(vec, data):
    # XXX: this code leaks memory:
    X = vec.transform(data)
    # XXX: ...while this code doesn't leak memory - why?
    # for doc in data:
    #     X = vec.transform([doc])
    return X.shape[1]


if __name__ == '__main__':
    vecname = sys.argv[1]
    fname = os.path.join('vec', vecname + '.joblib')

    newsgroups_test = datasets.fetch_20newsgroups(subset='test')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20

    start_load = time.time()
    vec = joblib.load(fname)
    end_load = time.time()
    after_load = p.get_memory_info().rss / 2**20

    start_transform = time.time()
    n_features = _transform(vec, newsgroups_test.data)
    end_transform = time.time()
    print("transform features: %d" % n_features)
    after_transform = p.get_memory_info().rss / 2**20

    print("load time: %0.1fs" % (end_load - start_load))
    print("load memory usage: %0.1fMB" % (after_load - before))
    print("transform time: %0.1fs" % (end_transform - start_transform))
    print("transform memory leak: %0.1fMB" % (after_transform - after_load))
    # print("total memory usage: %0.1fMB" % (after_transform - before))

setup.py:

#!/usr/bin/env python
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

setup(
    name='string_dict',
    cmdclass={'build_ext': build_ext},
    ext_modules=[Extension("string_dict", ["string_dict.pyx"], language='c++')],
)

string_dict.pyx:

from cython.operator cimport dereference as deref
from libcpp.string cimport string as cpp_string
from libcpp.map cimport map as cpp_map
from libcpp.utility cimport pair


# Hand-written declaration of std::unordered_map (older Cython versions
# did not ship a bundled libcpp.unordered_map):
cdef extern from "<unordered_map>" namespace "std":
    cdef cppclass unordered_map[T, U]:
        cppclass iterator:
            pair[T, U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(iterator)
            bint operator!=(iterator)
        cppclass reverse_iterator:
            pair[T, U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(reverse_iterator)
            bint operator!=(reverse_iterator)
        #cppclass const_iterator(iterator):
        #    pass
        #cppclass const_reverse_iterator(reverse_iterator):
        #    pass
        unordered_map()
        unordered_map(unordered_map&)
        #unordered_map(key_compare&)
        U& operator[](T&)
        #unordered_map& operator=(unordered_map&)
        bint operator==(unordered_map&, unordered_map&)
        bint operator!=(unordered_map&, unordered_map&)
        bint operator<(unordered_map&, unordered_map&)
        bint operator>(unordered_map&, unordered_map&)
        bint operator<=(unordered_map&, unordered_map&)
        bint operator>=(unordered_map&, unordered_map&)
        U& at(T&)
        iterator begin()
        #const_iterator begin()
        void clear()
        size_t count(T&)
        bint empty()
        iterator end()
        #const_iterator end()
        pair[iterator, iterator] equal_range(T&)
        #pair[const_iterator, const_iterator] equal_range(key_type&)
        void erase(iterator)
        void erase(iterator, iterator)
        size_t erase(T&)
        iterator find(T&)
        #const_iterator find(key_type&)
        pair[iterator, bint] insert(pair[T, U])  # XXX pair[T,U]&
        iterator insert(iterator, pair[T, U])  # XXX pair[T,U]&
        #void insert(input_iterator, input_iterator)
        #key_compare key_comp()
        iterator lower_bound(T&)
        #const_iterator lower_bound(key_type&)
        size_t max_size()
        reverse_iterator rbegin()
        #const_reverse_iterator rbegin()
        reverse_iterator rend()
        #const_reverse_iterator rend()
        size_t size()
        void swap(unordered_map&)
        iterator upper_bound(T&)
        #const_iterator upper_bound(key_type&)
        #value_compare value_comp()


cdef class UnicodeIntDict2:
    """unicode -> int mapping backed by std::unordered_map."""
    cdef unordered_map[cpp_string, int] _map

    def __init__(self):
        self._map = unordered_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()


cdef class UnicodeIntDict:
    """unicode -> int mapping backed by std::map."""
    cdef cpp_map[cpp_string, int] _map

    def __init__(self):
        self._map = cpp_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()
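
An aside (not part of string_dict.pyx): a quick sanity check for the
compiled extension, assuming python setup.py install succeeded. Both classes
expose the same minimal dict-like interface that StdCountVectorizer relies on:

    from string_dict import UnicodeIntDict, UnicodeIntDict2

    for cls in (UnicodeIntDict, UnicodeIntDict2):
        d = cls()
        d[u"foo"] = 1           # __setitem__ encodes keys to UTF-8 internally
        assert u"foo" in d      # __contains__
        assert d[u"foo"] == 1   # __getitem__
        assert len(d) == 1      # __len__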