Skip to content

Instantly share code, notes, and snippets.

@abevoelker
Last active August 29, 2015 13:59
Show Gist options
  • Save abevoelker/10606926 to your computer and use it in GitHub Desktop.
Save abevoelker/10606926 to your computer and use it in GitHub Desktop.
Scrapy item pipeline that enqueues scraped items as Sidekiq jobs (Resque support shouldn't be too hard to add)
# project_name/spiders/foo_spider.py
from scrapy.contrib.spiders import CrawlSpider
class FooSpider(CrawlSpider):
    """Example spider whose scraped items are handed to ``Foo::Worker``."""

    # Job options read from the spider by RedisPipeline. Only "klass" is
    # set here; the commented-out keys show the values the pipeline fills
    # in when they are left out.
    post_process = {
        "klass": "Foo::Worker",
        # "queue": "default",
        # "retry": True,
    }
    # ...
# project_name/pipelines.py
import binascii
import json as simplejson
import os

import redis
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.utils.serialize import ScrapyJSONEncoder

import settings
class RedisPipeline(object):
    """Put items into Redis to be processed by Sidekiq (Ruby) workers.

    Every scraped item is wrapped in a JSON job payload and pushed onto
    the Redis list for the queue named in the spider's ``post_process``
    dict.  That dict must provide ``klass`` (emitted as the job's
    ``class``); ``queue`` and ``retry`` are optional and default to
    ``'default'`` and ``True``.
    """

    def __init__(self):
        """Create the JSON encoder and a Redis client for ``settings.REDIS_URL``."""
        self.encoder = ScrapyJSONEncoder()
        self.redis = redis.Redis.from_url(settings.REDIS_URL)

    def open_spider(self, spider):
        """Fill in default job options and register the spider's queue.

        Args:
            spider: the spider being opened; its ``post_process`` dict is
                updated in place with any missing defaults.
        """
        # apply default post-processing values if not set
        spider.post_process.setdefault('queue', 'default')
        spider.post_process.setdefault('retry', True)
        # persist queue name to 'queues' Set so enqueued stats count is right
        self.redis.sadd("queues", spider.post_process['queue'])

    @classmethod
    def sidekiq_queue(cls, queue):
        """Return the Redis key used for the Sidekiq queue named *queue*."""
        return "queue:%s" % queue

    @classmethod
    def resque_queue(cls, queue):
        """Return the Redis key used for the Resque queue named *queue*."""
        return "resque:queue:%s" % queue

    @staticmethod
    def _new_jid():
        """Return a random job id: 12 random bytes as 24 hex characters.

        ``bytes.encode('hex')`` exists only on Python 2, so the bytes are
        hex-encoded with ``binascii.hexlify``, which behaves the same on
        Python 2 and Python 3.
        """
        return binascii.hexlify(os.urandom(12)).decode('ascii')

    def process_item(self, item, spider):
        """Serialize *item* as a job and push it onto the spider's queue.

        Args:
            item: the scraped item; it becomes the job's only argument.
            spider: the spider that produced the item; its ``post_process``
                dict supplies ``klass``, ``queue`` and ``retry``.

        Returns:
            The unmodified *item*.

        Raises:
            KeyError: if ``post_process`` lacks ``klass``, or lacks
                ``queue``/``retry`` because ``open_spider`` was not run.
        """
        options = spider.post_process
        queue = self.sidekiq_queue(options['queue'])
        job = {
            'class': options['klass'],
            'args': [item],
            'jid': self._new_jid(),
            'retry': options['retry'],
        }
        self.redis.rpush(queue, self.encoder.encode(job))
        return item
# project_name/settings.py
import os
# Project identity and the packages that hold the spiders.
BOT_NAME = 'project_name'
SPIDER_MODULES = ['project_name.spiders']
NEWSPIDER_MODULE = 'project_name.spiders'

# Pipelines applied to every scraped item.
ITEM_PIPELINES = ['project_name.pipelines.RedisPipeline']

# Redis connection URL: taken from the environment when REDIS_URL is set,
# otherwise a local Redis instance is used.
REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment