# Gist by @dangra (https://gist.github.com/dangra/4420549), created December 31, 2012

import os
from os.path import join, exists
from time import time
import cPickle as pickle
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
from scrapy import signals
from scrapy.http import Headers
from scrapy.http.request import Request
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy.utils.project import data_path


class DummyPolicy(object):

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = map(int, settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, response, request):
        return True

    def is_cached_response_valid(self, cached_response, response, request):
        return True
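
# Illustrative sketch, not part of the original gist: the two settings
# DummyPolicy reads, with assumed example values for a project's
# settings.py. With these values, file:// requests bypass the cache and
# the listed 5xx responses are never stored, while every cached response
# is treated as fresh and valid (i.e. never revalidated):
#
#   HTTPCACHE_IGNORE_SCHEMES = ['file']
#   HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]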


class RFC2616Policy(DummyPolicy):

    def __init__(self, settings):
        super(RFC2616Policy, self).__init__(settings)

    def should_cache_response(self, response, request):
        # Signature matches the middleware's call below, which passes
        # both response and request
        retval = super(RFC2616Policy, self).should_cache_response(response, request)
        # Honor "Cache-Control: no-store" on the response
        if 'cache-control' in response.headers:
            retval = retval and ('no-store' not in response.headers['cache-control'].lower())
        #retval = retval and self.policy_response(response)
        return retval

    def should_cache_request(self, request):
        retval = super(RFC2616Policy, self).should_cache_request(request)
        # Honor "Cache-Control: no-store" on the request
        if 'cache-control' in request.headers:
            retval = retval and ('no-store' not in request.headers['cache-control'].lower())
        #retval = retval and self.policy_request(request)
        return retval
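
# Illustrative sketch, not part of the original gist: the no-store check
# above is a plain substring test on the lowercased Cache-Control value,
# so any directive list containing "no-store" blocks caching. Here
# `settings` and `request` are assumed to be available:
#
#   from scrapy.http import Response
#   resp = Response('http://example.com/',
#                   headers={'Cache-Control': 'private, no-store'})
#   policy = RFC2616Policy(settings)
#   policy.should_cache_response(resp, request)  # -> False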


class Policy(object):
    """Interface that every cache policy must implement."""

    def should_cache_request(self, request):
        pass

    def should_cache_response(self, response, request):
        pass

    def is_cached_response_fresh(self, response, request):
        pass

    def is_cached_response_valid(self, cached_response, response, request):
        pass
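
# Illustrative sketch, not part of the original gist: HttpCacheMiddleware
# below instantiates whatever dotted paths the HTTPCACHE_POLICY and
# HTTPCACHE_STORAGE settings name, so a custom policy only needs to
# implement the four methods of the Policy interface above. The module
# path below is a hypothetical example:
#
#   HTTPCACHE_POLICY = 'myproject.httpcache.RFC2616Policy'
#   HTTPCACHE_STORAGE = 'myproject.httpcache.FilesystemCacheStorage'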


class HttpCacheMiddleware(object):

    def __init__(self, settings, stats):
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
        if 'dont_cache' in request.meta or \
                not self.policy.should_cache_request(request):
            request.meta['dont_cache'] = True
            return
        response = self.storage.retrieve_response(spider, request)
        if response is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request
        # Expiration check
        if self.policy.is_cached_response_fresh(response, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            response.flags.append('cached')
            return response
        # Stale response: let the request hit the network, but keep the
        # cached copy around so process_response() can revalidate it
        request.meta['cached_response'] = response

    def process_response(self, request, response, spider):
        if 'cached' in response.flags or 'dont_cache' in request.meta:
            return response
        # Do not validate first-hand responses
        cached_response = request.meta.pop('cached_response', None)
        if cached_response is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cached_response)
            return response
        if self.policy.is_cached_response_valid(cached_response, response, request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cached_response
        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self.storage.invalidate_response(spider, cached_response, request)
        self._cache_response(spider, response, request, cached_response)
        return response

    def _cache_response(self, spider, response, request, cached_response):
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value('httpcache/uncacheable', spider=spider)
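
# Illustrative sketch, not part of the original gist: the settings the
# middleware reads in __init__, with assumed example values. Without
# HTTPCACHE_ENABLED = True the middleware raises NotConfigured and Scrapy
# drops it. Individual requests can opt out via their meta dict, and
# cache activity shows up under the httpcache/* stats keys incremented
# above:
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_IGNORE_MISSING = False  # True: ignore requests not in cache
#
#   Request(url, meta={'dont_cache': True})  # per-request cache bypass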


class FilesystemCacheStorage(object):

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with open(join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with open(join(rpath, 'meta'), 'wb') as f:
            f.write(repr(metadata))
        with open(join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with open(join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with open(join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with open(join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with open(join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)
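
    def invalidate_response(self, spider, response, request):
        """Drop a cached entry that failed validation.

        Not part of the original gist: a minimal sketch added because
        HttpCacheMiddleware.process_response() calls
        storage.invalidate_response() but the original class never
        defined it. Removing the pickled metadata is enough, since
        _read_meta() below then reports the entry as missing.
        """
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if exists(metapath):
            os.remove(metapath)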

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if not exists(metapath):
            return  # not found
        mtime = os.stat(rpath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with open(metapath, 'rb') as f:
            return pickle.load(f)
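
# Illustrative sketch, not part of the original gist: the on-disk layout
# store_response() produces, assuming a spider named "example". The
# two-character subdirectory is the first two hex digits of the request
# fingerprint (a 40-character SHA1 hexdigest), fanning entries out
# across 256 buckets:
#
#   <HTTPCACHE_DIR>/example/a4/a4f1...<40-char fingerprint>/
#       meta              repr() of the metadata dict (human-readable)
#       pickled_meta      pickled copy of the same dict
#       request_headers   raw request headers
#       request_body      raw request body
#       response_headers  raw response headers
#       response_body     raw response body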