Last active
June 7, 2019 06:40
-
-
Save jbinfo/7e3bac6038fb618ad249 to your computer and use it in GitHub Desktop.
CloseSpider is a Scrapy extension that forces the spider to close once it reaches a limit on the number of dropped items.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# MIT License (c) Lhassan Baazzi <[email protected]>
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet import reactor
class CloseSpider(object):
    """Scrapy extension that closes the spider after too many dropped items.

    The limit is read from the ``CLOSESPIDER_DROP_ITEM_COUNT`` setting. Each
    ``item_dropped`` signal increments a counter; when the counter reaches
    the limit, the engine is asked to close the spider with the reason
    ``'closespider_drop_item_count'``. A falsy limit (e.g. ``0``) means no
    drop counting takes place.
    """

    def __init__(self, crawler):
        """Store the crawler, read the limit and subscribe to signals.

        Args:
            crawler: the crawler whose settings, signals and engine are used.
        """
        self.crawler = crawler
        self.close_on = {
            'drop_item_count': crawler.settings.getint('CLOSESPIDER_DROP_ITEM_COUNT'),
        }
        self.counter = defaultdict(int)

        # Only listen for dropped items when a non-zero limit is configured.
        if self.close_on.get('drop_item_count'):
            crawler.signals.connect(self.drop_item_count, signal=signals.item_dropped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension for *crawler*.

        Raises:
            NotConfigured: when ``CLOSESPIDER_DROP_ITEM_COUNT`` is falsy, so
                the extension is not loaded when it would have nothing to do.
        """
        if not crawler.settings.getint('CLOSESPIDER_DROP_ITEM_COUNT'):
            raise NotConfigured
        return cls(crawler)

    def drop_item_count(self, item, response, exception, spider):
        """Count one dropped item and close *spider* when the limit is hit.

        ``item``, ``response`` and ``exception`` are accepted because they
        are part of the handler signature, but they are not used here.
        """
        self.counter['drop_item_count'] += 1
        # Strict equality: the close request is issued exactly once, even if
        # more items are dropped while the spider is shutting down.
        if self.counter['drop_item_count'] == self.close_on['drop_item_count']:
            self.crawler.engine.close_spider(spider, 'closespider_drop_item_count')

    def spider_closed(self, spider):
        """Cancel a still-active ``task`` attached to this instance, if any.

        Nothing in this class assigns ``self.task``, so this is a no-op
        unless a subclass or external code sets it.
        NOTE(review): looks like a leftover from a timeout-based variant of
        this extension -- confirm before removing.
        """
        task = getattr(self, 'task', None)
        if task and task.active():
            task.cancel()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ...
# ...
# ...

# After 10 dropped items the spider will be closed.
CLOSESPIDER_DROP_ITEM_COUNT = 10

# Replace ``project_name`` with the name of your Scrapy project. The path
# below assumes closespider.py lives in the project's ``extensions`` package.
EXTENSIONS = {
    # ...
    'project_name.extensions.closespider.CloseSpider': 500,
    # ...
}

# ...
# ...
# ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment