@dvdbng · Created April 20, 2016
Use Scrapy and Splash in the same process

Two files make this work: scrapy_spider.py defines the spider, a download
handler that renders each page through a Splash BrowserTab directly (no HTTP
round-trip to a Splash server), and a start_spider() entry point; the launcher
script below it boots Splash and triggers the crawl from Splash's
server_factory hook, so both run on the same Twisted reactor.
# scrapy_spider.py
import random

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.http import TextResponse
from twisted.internet import defer
from splash.browser_tab import BrowserTab
from splash.render_options import RenderOptions
from splash import defaults

# Filled in by the launcher once Splash is up (see the second file below):
# a factory that builds Splash network managers for new browser tabs.
nm_factory = None

class BlogSpider(scrapy.Spider):
    """
    The main spider: collects post titles from the Scrapinghub blog.
    """
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        # Follow every category link, then scrape titles from those pages.
        for url in response.css('ul li a::attr("href")').re('.*/category/.*'):
            yield scrapy.Request(response.urljoin(url), self.parse_titles)

    def parse_titles(self, response):
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}

class SplashDownloadHandler(object):
    """
    Download handler that opens a Splash browser tab for each requested URL.
    """
    def __init__(self, settings):
        pass

    def download_request(self, request, spider):
        d = defer.Deferred()
        tab = BrowserTab(
            network_manager=nm_factory(),
            splash_proxy_factory=None,
            verbosity=3,
            # RenderOptions wants a uid identifying this render; a random
            # one is good enough here.
            render_options=RenderOptions({
                "uid": str(random.randint(0, 10**10))
            }, defaults.MAX_TIMEOUT),
            visible=False,
        )

        def callback(*args):
            # The page finished loading: hand the rendered HTML to Scrapy.
            res = TextResponse(request.url, body=tab.html(), encoding='utf-8')
            tab.close()
            d.callback(res)

        tab.go(request.url, callback=callback, errback=d.errback)
        return d

# Custom settings: route all http/https downloads through the
# Splash-backed handler defined above.
settings = Settings({
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_spider.SplashDownloadHandler',
        'https': 'scrapy_spider.SplashDownloadHandler',
    }
})
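
# Not part of the gist: if only some requests should be rendered by Splash,
# a selective handler could fall back to Scrapy's stock HTTP handler. A
# sketch, assuming Scrapy 1.x-era handler construction (handler_cls(settings));
# the meta key 'use_splash' is made up for this example.
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler


class SelectiveSplashDownloadHandler(object):
    def __init__(self, settings):
        self._splash = SplashDownloadHandler(settings)
        self._plain = HTTPDownloadHandler(settings)

    def download_request(self, request, spider):
        if request.meta.get('use_splash'):
            return self._splash.download_request(request, spider)
        return self._plain.download_request(request, spider)

# To use it, point DOWNLOAD_HANDLERS above at
# 'scrapy_spider.SelectiveSplashDownloadHandler' instead.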

def start_spider():
    # The (Splash-owned) Twisted reactor is already running at this point:
    # CrawlerRunner just schedules the crawl on it and returns.
    runner = CrawlerRunner(settings)
    runner.crawl(BlogSpider)
    crawler = list(runner.crawlers)[0]  # keep a handle on the live crawler
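
# Not part of the gist: CrawlerRunner.crawl() returns a twisted Deferred,
# which start_spider() above discards. A variant that at least surfaces
# crawl failures could look like this (a sketch):
def start_spider_with_logging():
    runner = CrawlerRunner(settings)
    d = runner.crawl(BlogSpider)
    d.addErrback(lambda failure: failure.printTraceback())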

# The gist's second file, a launcher script (save it next to scrapy_spider.py):

#!/usr/bin/env python
import splash.server


def start_spider(**kwargs):
    # We need to wait until Splash has started before importing any Scrapy
    # module, so the import happens inside this server_factory hook.
    import scrapy_spider
    # Hand over the network manager factory that Splash created on startup.
    scrapy_spider.nm_factory = kwargs['network_manager_factory']
    scrapy_spider.start_spider()


if __name__ == '__main__':
    splash.server.main(server_factory=start_spider, argv=[])
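
To try this, save the first block as scrapy_spider.py and the launcher as a
sibling file (its original filename is not shown here), in an environment where
both scrapy and splash import cleanly. Running the launcher should start the
Splash HTTP server and immediately kick off the crawl in the same process, with
pages rendered in-process by BrowserTab rather than via Splash's HTTP API.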