ScrapyCommand is a Django management command that fires multiple Scrapy spiders, each with its own settings and parameters (e.g., if they need to support different item pipelines). These settings are defined by a settings dictionary in Django's config settings. ScrapyCommand can be extended by other commands to support separate scraping jobs. When t…
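For example, the dictionary referenced below as django_settings.SPIDER_SETTINGS might look like this in Django's settings.py (a minimal sketch; the pipeline path is hypothetical, and the list-style ITEM_PIPELINES matches the pre-1.0 Scrapy this gist targets):

# settings.py (Django) -- hypothetical pipeline path
SPIDER_SETTINGS = {
    "ITEM_PIPELINES": ["project.apps.scrapers.pipelines.ExamplePipeline"],
    "DOWNLOAD_DELAY": 1.0,
}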
from __future__ import absolute_import

# Written against pre-1.0 Scrapy (CrawlerSettings, scrapy.log,
# scrapy.xlib.pydispatch), which this gist targets.
from django.conf import settings as django_settings
from django.core.management.base import BaseCommand
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher

from project.apps.scrapers.spiders import spider_module


class ScrapyCommand(BaseCommand):
    # Each entry describes one crawl job: the spider class, a Scrapy
    # settings dict pulled from Django's config settings, and keyword
    # arguments passed to the spider's constructor.
    spiders = [{
        "class": spider_module.SpiderClass,
        "settings": django_settings.SPIDER_SETTINGS,
        "params": {},
    }]

    def __init__(self, *args, **kwargs):
        super(ScrapyCommand, self).__init__(*args, **kwargs)
        self.spider_items = {}
        dispatcher.connect(self.stop_spider, signal=signals.spider_closed)
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)

    def run_from_argv(self, argv):
        self._argv = argv
        self.execute()

    def item_scraped(self, item, spider):
        # Count items per spider so empty crawls can be detected on close.
        self.spider_items[spider] += 1

    def stop_spider(self, spider):
        log.msg("Spider %s closed: %s items found."
                % (spider.name, self.spider_items[spider]))
        if not self.spider_items[spider]:
            # NoItemsException stops the reactor itself, so nothing after
            # this raise runs for an empty crawl.
            raise NoItemsException(spider.name)
        del self.spider_items[spider]
        # Once the last spider has closed, shut down the Twisted reactor.
        if not self.spider_items:
            reactor.stop()

    def handle(self, *args, **options):
        # Configure and start one Crawler per entry, each with its own
        # settings overrides (e.g., different item pipelines).
        for spider_config in self.spiders:
            settings = CrawlerSettings()
            settings.overrides = spider_config["settings"]
            spider = spider_config["class"](**spider_config["params"])
            self.spider_items[spider] = 0
            crawler = Crawler(settings)
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
        log.start()
        reactor.run()


class NoItemsException(Exception):
    def __init__(self, spider):
        self.spider = spider
        #
        # send alert or email from here
        #
        reactor.stop()

    def __str__(self):
        return repr(self.spider)
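To give each scraping job its own management command, extend ScrapyCommand and override the spiders list. A minimal sketch, assuming a hypothetical OtherSpider, an OTHER_SPIDER_SETTINGS dict in Django's settings, and a module path matching your project layout:

# management/commands/crawl_other.py -- hypothetical names throughout
from django.conf import settings as django_settings
from project.apps.scrapers.management.commands.scrapy_command import ScrapyCommand
from project.apps.scrapers.spiders import other_spider

class Command(ScrapyCommand):
    help = "Run the 'other' crawl with its own pipeline settings."
    spiders = [{
        "class": other_spider.OtherSpider,
        "settings": django_settings.OTHER_SPIDER_SETTINGS,
        "params": {"category": "books"},  # forwarded to OtherSpider.__init__
    }]

The command then runs like any other: python manage.py crawl_other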