Run a Scrapy spider in a Celery Task (in django)

My first shot at fixing the "ReactorNotRestartable" problem was the tasks.py snippet below.

But that just gave an "Unhandled error in Deferred", so I went on to use CrawlerRunner.

That one showed no output at all anymore, and didn't run as expected either.

Eventually, I just settled on CELERY_WORKER_MAX_TASKS_PER_CHILD = 1 in settings.py.

Note: the CELERY_ prefix on CELERY_WORKER_MAX_TASKS_PER_CHILD is the Django settings namespace; standalone Celery calls the same setting worker_max_tasks_per_child.
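For reference, here's a minimal sketch of that final setup (scrape_task and MySpider are placeholder names, not the project's actual code):

# settings.py (Django): recycle each worker child after a single task,
# so the twisted reactor never has to restart inside one process
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1

# tasks.py: with one task per child, a plain CrawlerProcess is enough
from celery import shared_task
from scrapy.crawler import CrawlerProcess

@shared_task
def scrape_task(url):
    process = CrawlerProcess(settings={'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MySpider, url=url)  # MySpider: placeholder for the real spider class
    process.start()  # blocks until the crawl finishes; the child exits after the task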

# Utility class from https://stackoverflow.com/a/22202877/4126114
# because the twisted reactor raises "ReactorNotRestartable" when started a second time
#
# Modified for celery==4.1.0 Scrapy==1.5.0 billiard==3.5.0.3
from billiard import Process  # billiard's Process can be forked from a (daemonic) Celery worker; multiprocessing's cannot
from scrapy import signals as scrapy_signals
from scrapy.crawler import Crawler
from twisted.internet import reactor

# CrJusticeGovLbSpiderDjango is the project's spider class, defined elsewhere


class UrlCrawlerScript(Process):
    def __init__(self, spider_cls, url):
        Process.__init__(self)
        # Scrapy 1.5's Crawler takes the spider *class*; the spider's
        # constructor arguments are passed later through crawl()
        self.crawler = Crawler(
            spider_cls,
            settings={
                'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
                'ITEM_PIPELINES': {
                    # ...
                },
            },
        )
        # stop the reactor once the spider finishes, so run() can return
        self.crawler.signals.connect(reactor.stop, signal=scrapy_signals.spider_closed)
        self.url = url

    def run(self):
        self.crawler.crawl(self.url)
        reactor.run()


def run_spider(url):
    crawler = UrlCrawlerScript(CrJusticeGovLbSpiderDjango, url)
    # the script will block here until the crawling is finished
    crawler.start()
    crawler.join()
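Hooking run_spider into Celery is then just a thin task wrapper; a minimal sketch (the task name crawl_url is a placeholder, not the project's actual task):

from celery import shared_task

@shared_task
def crawl_url(url):
    # each call forks a fresh billiard Process, so the reactor
    # starts and stops inside a child that is then thrown away
    run_spider(url)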
# as above, but using CrawlerRunner as documented at
# https://doc.scrapy.org/en/latest/topics/practices.html
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor


def run_url(instance, df_in):
    spider_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {
            # no need for the original pipeline since this local one inherits from it
            'bsec_compliance_kyc_import.scrapy_django_pipeline.DjangoPipeline': 400,
        },
    }
    runner = CrawlerRunner(settings=spider_settings)
    # pass the spider class; crawl() forwards the kwargs to its constructor
    d = runner.crawl(CrJusticeGovLbSpiderDjango, scrape_instance=instance, df_in=df_in)
    # keep the reactor running between tasks to avoid "ReactorNotRestartable"
    # d.addBoth(lambda _: reactor.stop())
    # the script will block here until the crawling is finished
    if not reactor.running:
        reactor.run()
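Note the catch with this version: since reactor.stop() is commented out, reactor.run() never returns, so the first task blocks its worker indefinitely. Presumably that's part of why it "didn't run as expected", and why CELERY_WORKER_MAX_TASKS_PER_CHILD = 1 ended up being the pragmatic fix.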