ScrapyCommand is a Django management command that fires multiple Scrapy spiders, each with its own settings and parameters (e.g., when they need to support different pipelines). The settings are defined by a settings dictionary in Django's config settings. ScrapyCommand can be extended by other commands to support separate scraping jobs. When t…
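For illustration, the per-spider settings dictionary referenced above might look like this in settings.py. This is a hypothetical sketch: the key names are standard Scrapy settings, but the pipeline path and values are assumptions (in the Scrapy versions of this era, ITEM_PIPELINES was a plain list rather than a dict):

# settings.py (illustrative sketch; pipeline path and values are hypothetical)
SPIDER_SETTINGS = {
    'ITEM_PIPELINES': ['project.apps.scrapers.pipelines.ExamplePipeline'],
    'DOWNLOAD_DELAY': 0.5,
}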
from __future__ import absolute_import

from django.core.management.base import BaseCommand
from django.conf import settings as django_settings
from twisted.internet import reactor
# Legacy (pre-1.0) Scrapy APIs: CrawlerSettings, log, and xlib.pydispatch
# were current when this gist was written.
from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher

from project.apps.scrapers.spiders import spider_module
class ScrapyCommand(BaseCommand):
    # Each entry defines one spider run: the spider class, the Scrapy
    # settings to apply (e.g. a different set of pipelines), and keyword
    # arguments passed to the spider's constructor. Subclasses override this.
    spiders = [{
        "class": spider_module.SpiderClass,
        "settings": django_settings.SPIDER_SETTINGS,
        "params": {},
    }]
    def __init__(self, *args, **kwargs):
        super(ScrapyCommand, self).__init__(*args, **kwargs)
        self.spider_items = {}
        # Count scraped items per spider and stop the reactor once every
        # spider has closed.
        dispatcher.connect(self.stop_spider, signal=signals.spider_closed)
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
    def run_from_argv(self, argv):
        self._argv = argv
        self.execute()

    def item_scraped(self, item, spider):
        self.spider_items[spider] += 1
    def stop_spider(self, spider):
        log.msg("Spider %s closed: %s items found." % (spider.name, self.spider_items[spider]))
        if not self.spider_items[spider]:
            # NoItemsException stops the reactor itself; see its __init__ below.
            raise NoItemsException(spider.name)
        del self.spider_items[spider]
        if not self.spider_items:
            # All spiders have finished; release the Twisted event loop.
            reactor.stop()
    def handle(self, *args, **options):
        for spider_def in self.spiders:
            settings = CrawlerSettings()
            settings.overrides = spider_def['settings']
            # Instantiate the spider with its own parameters and give it a
            # dedicated crawler configured with its own settings.
            spider = spider_def['class'](**spider_def['params'])
            self.spider_items[spider] = 0
            crawler = Crawler(settings)
            crawler.configure()
            crawler.crawl(spider)
            crawler.start()
        log.start()
        # Blocks until stop_spider() (or NoItemsException) stops the reactor.
        reactor.run()
class NoItemsException(Exception):
    def __init__(self, spider):
        self.spider = spider
        #
        # send alert or email from here
        #
        reactor.stop()

    def __str__(self):
        return repr(self.spider)
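A minimal sketch of how a concrete command might extend ScrapyCommand, as the description suggests. The module paths, spider class, and settings name below are assumptions, not part of the gist; the file would live in an app's management/commands/ directory so Django's command discovery finds it.

# management/commands/crawl_products.py (hypothetical path and names)
from django.conf import settings as django_settings
from project.apps.scrapers.scrapy_command import ScrapyCommand  # wherever ScrapyCommand lives
from project.apps.scrapers.spiders import product_spider        # hypothetical module

class Command(ScrapyCommand):
    help = "Crawl product pages."
    spiders = [{
        "class": product_spider.ProductSpider,                # hypothetical spider
        "settings": django_settings.PRODUCT_SPIDER_SETTINGS,  # hypothetical settings dict
        "params": {"category": "electronics"},                # passed to the spider's __init__
    }]

Running python manage.py crawl_products would then start every spider in the list and return once they have all closed.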