# Gist juanriaza/8c8cdac8ec311711c921: a Splash-aware HTTP cache storage,
# dupe filter and downloader middleware for Scrapy.

# cache.py (filename assumed; same package as dupefilter.py below)
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os

from scrapy.contrib.httpcache import FilesystemCacheStorage

from .dupefilter import splash_request_fingerprint


class SplashAwareFSCacheStorage(FilesystemCacheStorage):
    """Filesystem cache storage that keys entries by the Splash-aware
    fingerprint, so the same URL rendered with different 'splash'
    options is cached separately."""

    def _get_request_path(self, spider, request):
        key = splash_request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)
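
# Usage sketch (my addition, not part of the original gist): enable the
# storage through Scrapy's standard HTTP cache settings in settings.py.
# The module path below is hypothetical; point it at wherever this file
# lives in your project.
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_STORAGE = 'mypackage.cache.SplashAwareFSCacheStorage'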

# dupefilter.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import hashlib

from scrapy.dupefilter import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


def splash_request_fingerprint(request, include_headers=None):
    """ Request fingerprint that takes 'splash' meta key into account """
    fp = request_fingerprint(request, include_headers=include_headers)
    if 'splash' not in request.meta:
        return fp
    # Mix the Splash options into the standard fingerprint; sorting makes
    # the result independent of dict ordering.
    h = hashlib.sha1(fp)
    for key, value in sorted(request.meta['splash'].items()):
        h.update(key)
        h.update(str(value))
    return h.hexdigest()


class SplashAwareDupeFilter(RFPDupeFilter):
    """
    DupeFilter that takes 'splash' meta key into account.
    It should be used with SplashMiddleware.
    """
    def request_fingerprint(self, request):
        return splash_request_fingerprint(request)
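
# To enable the dupe filter project-wide, point Scrapy's DUPEFILTER_CLASS
# setting at it, e.g.
# DUPEFILTER_CLASS = 'mypackage.dupefilter.SplashAwareDupeFilter'
# (module path hypothetical).
#
# A minimal self-check (my addition, not part of the original gist): two
# requests for the same URL must get different fingerprints once their
# 'splash' options differ. Assumes the same Python 2 / Scrapy 0.x
# environment as the rest of the gist.
if __name__ == '__main__':
    from scrapy.http import Request

    plain = Request('http://example.com')
    rendered = Request('http://example.com', meta={'splash': {'html': 1}})
    assert splash_request_fingerprint(plain) == splash_request_fingerprint(plain)
    assert splash_request_fingerprint(plain) != splash_request_fingerprint(rendered)
    print('splash options change the fingerprint, as expected')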

# splash_mw.py (module name inferred from the DOWNLOADER_MIDDLEWARES example)
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import logging
from urlparse import urljoin
from urllib import urlencode

from scrapy import log


class SplashMiddleware(object):
    """
    Scrapy downloader middleware that passes requests through Splash_
    when 'splash' Request.meta key is set.

    To enable the middleware add it to settings::

        DOWNLOADER_MIDDLEWARES = {
            'splash_mw.SplashMiddleware': 950,
        }

    and then use ``splash`` meta key to pass options::

        yield Request(url, self.parse_result, meta={'splash': {
            # use render.json options here
            'html': 1,
            'png': 1,
        }})

    The response received by the callback is then the raw Splash response;
    with the default ``render.json`` endpoint its body is a JSON-encoded
    object.

    .. _Splash: https://github.com/scrapinghub/splash
    """
    DEFAULT_SPLASH_URL = 'http://127.0.0.1:8050'
    SPLASH_EXTRA_TIMEOUT = 10
    RESPECT_SLOTS = True

    def __init__(self, crawler, splash_url):
        self.crawler = crawler
        self._splash_url = splash_url

    @classmethod
    def from_crawler(cls, crawler):
        url = crawler.settings.get('SPLASH_URL', cls.DEFAULT_SPLASH_URL)
        return cls(crawler, url)

    def splash_url(self, query, url, endpoint='render.json'):
        """Build the Splash endpoint URL that renders ``url``, e.g.
        http://127.0.0.1:8050/render.json?url=...&html=1
        (query argument order may vary)."""
        query = query.copy()
        query['url'] = url
        return urljoin(self._splash_url, endpoint) + '?' + urlencode(query)

    def process_request(self, request, spider):
        splash_options = request.meta.get('splash')
        if not splash_options:
            return

        if request.method != 'GET':
            log.msg("Only GET requests are supported by SplashMiddleware; "
                    "%s will be handled without Splash" % request,
                    logging.WARNING)
            # Returning None (instead of the request itself, which would be
            # rescheduled and then dropped by the dupe filter) lets the
            # request continue down the middleware chain without Splash.
            return

        for key, value in splash_options.items():
            if key.lower() == 'timeout':
                # Make sure Scrapy doesn't abort the download before Splash
                # has had a chance to time out on its own.
                request.meta['download_timeout'] = max(
                    request.meta.get('download_timeout', 1e6),
                    float(value) + self.SPLASH_EXTRA_TIMEOUT
                )

        meta = request.meta.copy()
        del meta['splash']
        meta['_splash'] = True

        if self.RESPECT_SLOTS:
            # Use the same download slot to (sort of) respect download
            # delays and concurrency options.
            meta['download_slot'] = self._get_slot_key(request)

        self.crawler.stats.inc_value('splash/request_count')
        return request.replace(
            url=self.splash_url(splash_options, request.url),
            meta=meta,
            # FIXME: original HTTP headers are not respected.
            # To respect them changes to Splash are needed.
            headers={},
        )

    def process_response(self, request, response, spider):
        if '_splash' in request.meta:
            self.crawler.stats.inc_value(
                'splash/response_count/%s' % response.status)
        return response

    def _get_slot_key(self, request_or_response):
        return self.crawler.engine.downloader._get_slot_key(
            request_or_response, None)
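
# example_spider.py: an end-to-end usage sketch (my addition, not part of
# the original gist; the spider name, URL and meta options are
# illustrative). It yields a request carrying the 'splash' meta key and
# decodes the JSON body that Splash's render.json endpoint returns.
import json

import scrapy


class RenderedSpider(scrapy.Spider):
    name = 'rendered'

    def start_requests(self):
        yield scrapy.Request('http://example.com', self.parse_result,
                             meta={'splash': {'html': 1, 'timeout': 30}})

    def parse_result(self, response):
        # response.body is the JSON document produced by render.json;
        # the 'html' key holds the rendered page.
        data = json.loads(response.body)
        self.log('rendered %d bytes of HTML' % len(data['html']))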