This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import absolute_import | |
import logging | |
import urlparse | |
from reppy.parser import Rules | |
from scrapy import log | |
from scrapy.exceptions import NotConfigured | |
from scrapy.http import Request |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
from scrapy.http import safeurl | |
class Spider(scrapy.Spider): | |
name = 'loremipsum' | |
start_urls = ('https://www.lipsum.com',) | |
def parse(self, response): |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Folder structure should be the following: | |
vectorizers/ | |
vec/ | |
stored/ | |
string_dict/ | |
string_dict.pyx | |
setup.py | |
marisa_vectorizers.py | |
memusage_fit.py |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.sparse as sp | |
import hat_trie | |
from sklearn.feature_extraction.text import CountVectorizer, _make_int_array | |
class HatTrieCountVectorizer(CountVectorizer): | |
def _count_vocab(self, raw_documents, fixed_vocab): | |
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import marisa_trie | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.externals import six | |
class MarisaCountVectorizer(CountVectorizer): | |
# ``CountVectorizer.fit`` method calls ``fit_transform`` so | |
# ``fit`` is not provided | |
def fit_transform(self, raw_documents, y=None): |