Last active
July 7, 2016 14:09
-
-
Save ziplokk1/76a3f4d454403a745b0773e6ea3cf3be to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Downloader middleware that pauses for a configurable timeout and then retries
any response whose status code is listed in the TIMEOUT_STATUS_CODES setting.
"""
import time | |
from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware | |
from scrapy.utils.response import response_status_message | |
class PauseOnStatusMiddleware(RetryMiddleware):
    """Retry responses with selected status codes after sleeping for a timeout.

    Config:
        # settings.py
        TIMEOUT_STATUS_CODES = [503]  # status codes to retry after the pause
        RETRY_TIMEOUT = 2.5  # optional, in seconds. Default is 2.5.

        # Set the middleware index to 501 which comes right after the retry
        # middleware. This way when a response is received, the
        # PauseOnStatusMiddleware intercepts the response before the base
        # retry middleware.
        DOWNLOADER_MIDDLEWARES = {
            'your_module.middleware.PauseOnStatusMiddleware': 501
        }
    """

    # Default pause, in seconds, used when RETRY_TIMEOUT is not configured.
    TIMEOUT = 2.5

    def __init__(self, settings):
        """Read the pause configuration on top of the base retry settings.

        :param settings: Scrapy settings object; must provide ``getlist`` and
            ``getfloat`` in addition to whatever ``RetryMiddleware`` reads.
        """
        RetryMiddleware.__init__(self, settings)
        self.timeout_status_codes = {
            int(code) for code in settings.getlist('TIMEOUT_STATUS_CODES')
        }
        # getfloat() rather than get(): a value supplied as a string (e.g. via
        # ``-s RETRY_TIMEOUT=5`` on the command line) must be coerced to a
        # number, otherwise time.sleep() raises TypeError.
        self.TIMEOUT = settings.getfloat('RETRY_TIMEOUT', self.TIMEOUT)

    def process_response(self, request, response, spider):
        """Pause and schedule a retry when the response status is configured.

        :param request: the request that produced ``response``.
        :param response: the response being processed.
        :param spider: the spider the request belongs to.
        :returns: the retry request built by ``_retry`` when the status is in
            ``timeout_status_codes`` and ``_retry`` yields one; otherwise the
            original ``response`` unchanged.
        """
        # Requests that opted out of retrying pass straight through.
        if request.meta.get('dont_retry', False):
            return response

        if response.status not in self.timeout_status_codes:
            return response

        # Move to the next cookiejar id before retrying.
        # NOTE(review): presumably this gives the retry a fresh cookie
        # session; it assumes any existing 'cookiejar' value is numeric.
        request.meta['cookiejar'] = request.meta.get('cookiejar', 0) + 1

        # NOTE(review): time.sleep() blocks the calling thread. If this runs
        # on the thread that drives all downloads, every in-flight request
        # stalls for the whole timeout -- confirm that is acceptable.
        time.sleep(self.TIMEOUT)

        reason = response_status_message(response.status)
        # _retry() may return a falsy value (no retry scheduled); fall back to
        # handing the original response on in that case.
        return self._retry(request, reason, spider) or response
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment