Skip to content

Instantly share code, notes, and snippets.

@kmike
kmike / robots_mw.py
Last active February 21, 2017 10:47
RobotsCrawlDelayMiddleware
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import logging
import urlparse
from reppy.parser import Rules
from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.http import Request
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@kmike
kmike / idealspider.py
Last active August 29, 2015 14:05 — forked from dangra/idealspider.py
import scrapy
from scrapy.http import safeurl
class Spider(scrapy.Spider):
name = 'loremipsum'
start_urls = ('https://www.lipsum.com',)
def parse(self, response):
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Folder structure should be the following:
vectorizers/
vec/
stored/
string_dict/
string_dict.pyx
setup.py
marisa_vectorizers.py
memusage_fit.py
import numpy as np
import scipy.sparse as sp
import hat_trie
from sklearn.feature_extraction.text import CountVectorizer, _make_int_array
class HatTrieCountVectorizer(CountVectorizer):
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
import numpy as np
import marisa_trie
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import six
class MarisaCountVectorizer(CountVectorizer):
# ``CountVectorizer.fit`` method calls ``fit_transform`` so
# ``fit`` is not provided
def fit_transform(self, raw_documents, y=None):