@dvdbng · Created April 20, 2016
Use Scrapy and Splash in the same process

Two files make this work: scrapy_spider.py defines the spider, a download
handler that renders each page through a Splash BrowserTab directly (no HTTP
round-trip to a Splash server), and a start_spider() entry point; the launcher
script below it boots Splash and triggers the crawl from Splash's
server_factory hook, so both run on the same Twisted reactor.
# scrapy_spider.py
import random

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.http import TextResponse
from twisted.internet import defer
from splash.browser_tab import BrowserTab
from splash.render_options import RenderOptions
from splash import defaults

# Filled in by the launcher once Splash is up (see the second file below):
# a factory that builds Splash network managers for new browser tabs.
nm_factory = None

class BlogSpider(scrapy.Spider):
    """
    The main spider: collects post titles from the Scrapinghub blog.
    """
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        # Follow every category link, then scrape titles from those pages.
        for url in response.css('ul li a::attr("href")').re('.*/category/.*'):
            yield scrapy.Request(response.urljoin(url), self.parse_titles)

    def parse_titles(self, response):
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}

class SplashDownloadHandler(object):
    """
    Download handler that opens a Splash browser tab for each requested URL.
    """
    def __init__(self, settings):
        pass

    def download_request(self, request, spider):
        d = defer.Deferred()
        tab = BrowserTab(
            network_manager=nm_factory(),
            splash_proxy_factory=None,
            verbosity=3,
            # RenderOptions wants a uid identifying this render; a random
            # one is good enough here.
            render_options=RenderOptions({
                "uid": str(random.randint(0, 10**10))
            }, defaults.MAX_TIMEOUT),
            visible=False,
        )

        def callback(*args):
            # The page finished loading: hand the rendered HTML to Scrapy.
            res = TextResponse(request.url, body=tab.html(), encoding='utf-8')
            tab.close()
            d.callback(res)

        tab.go(request.url, callback=callback, errback=d.errback)
        return d

# Custom settings: route all http/https downloads through the
# Splash-backed handler defined above.
settings = Settings({
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_spider.SplashDownloadHandler',
        'https': 'scrapy_spider.SplashDownloadHandler',
    }
})
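
# Not part of the gist: if only some requests should be rendered by Splash,
# a selective handler could fall back to Scrapy's stock HTTP handler. A
# sketch, assuming Scrapy 1.x-era handler construction (handler_cls(settings));
# the meta key 'use_splash' is made up for this example.
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler


class SelectiveSplashDownloadHandler(object):
    def __init__(self, settings):
        self._splash = SplashDownloadHandler(settings)
        self._plain = HTTPDownloadHandler(settings)

    def download_request(self, request, spider):
        if request.meta.get('use_splash'):
            return self._splash.download_request(request, spider)
        return self._plain.download_request(request, spider)

# To use it, point DOWNLOAD_HANDLERS above at
# 'scrapy_spider.SelectiveSplashDownloadHandler' instead.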

def start_spider():
    # The (Splash-owned) Twisted reactor is already running at this point:
    # CrawlerRunner just schedules the crawl on it and returns.
    runner = CrawlerRunner(settings)
    runner.crawl(BlogSpider)
    crawler = list(runner.crawlers)[0]  # keep a handle on the live crawler
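
# Not part of the gist: CrawlerRunner.crawl() returns a twisted Deferred,
# which start_spider() above discards. A variant that at least surfaces
# crawl failures could look like this (a sketch):
def start_spider_with_logging():
    runner = CrawlerRunner(settings)
    d = runner.crawl(BlogSpider)
    d.addErrback(lambda failure: failure.printTraceback())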

# The gist's second file, a launcher script (save it next to scrapy_spider.py):

#!/usr/bin/env python
import splash.server


def start_spider(**kwargs):
    # We need to wait until Splash has started before importing any Scrapy
    # module, so the import happens inside this server_factory hook.
    import scrapy_spider
    # Hand over the network manager factory that Splash created on startup.
    scrapy_spider.nm_factory = kwargs['network_manager_factory']
    scrapy_spider.start_spider()


if __name__ == '__main__':
    splash.server.main(server_factory=start_spider, argv=[])
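
To try this, save the first block as scrapy_spider.py and the launcher as a
sibling file (its original filename is not shown here), in an environment where
both scrapy and splash import cleanly. Running the launcher should start the
Splash HTTP server and immediately kick off the crawl in the same process, with
pages rendered in-process by BrowserTab rather than via Splash's HTTP API.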