Use Scrapy and Splash in the same process
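The gist has two files. The first, scrapy_spider.py, defines an ordinary Scrapy spider plus a custom download handler that, instead of issuing a plain HTTP request, opens a Splash BrowserTab, lets it render the page, and hands the rendered HTML back to Scrapy as a TextResponse. The second file is a small launcher that starts Splash via splash.server.main() and uses its server_factory hook to kick off the crawl inside the same Twisted reactor and process.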
scrapy_spider.py: the spider and the Splash-backed download handler.
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.http import TextResponse
from twisted.internet import defer
from splash.browser_tab import BrowserTab
from splash.render_options import RenderOptions
from splash import defaults
import random

nm_factory = None


class BlogSpider(scrapy.Spider):
    """
    The main spider class
    """
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        for url in response.css('ul li a::attr("href")').re('.*/category/.*'):
            yield scrapy.Request(response.urljoin(url), self.parse_titles)

    def parse_titles(self, response):
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}


class SplashDownloadHandler(object):
    """
    Download handler that creates a splash tab for each requested url
    """
    def __init__(self, settings):
        pass

    def download_request(self, request, spider):
        d = defer.Deferred()
        tab = BrowserTab(
            network_manager=nm_factory(),
            splash_proxy_factory=None,
            verbosity=3,
            render_options=RenderOptions({
                "uid": str(random.randint(0, 10**10))
            }, defaults.MAX_TIMEOUT),
            visible=False,
        )

        def callback(*args):
            res = TextResponse(request.url, body=tab.html(), encoding='utf-8')
            tab.close()
            d.callback(res)

        tab.go(request.url, callback=callback, errback=d.errback)
        return d


# Custom settings
settings = Settings({
    # Override the download handler
    "DOWNLOAD_HANDLERS": {
        'http': 'scrapy_spider.SplashDownloadHandler',
        'https': 'scrapy_spider.SplashDownloadHandler',
    }
})


def start_spider():
    runner = CrawlerRunner(settings)
    runner.crawl(BlogSpider)
    crawler = list(runner.crawlers)[0]
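The last line grabs the Crawler instance but the gist stops there. As a hypothetical extension (not part of the original), that instance could be used to observe scraped items through Scrapy's signal API, for example:

from scrapy import signals

def on_item_scraped(item, response, spider):
    # Hypothetical handler, not in the gist: print every item the spider yields.
    print(item)

def start_spider():
    runner = CrawlerRunner(settings)
    runner.crawl(BlogSpider)
    crawler = list(runner.crawlers)[0]
    # Assumption: connecting a handler to item_scraped is one plausible use
    # for the crawler instance fetched above.
    crawler.signals.connect(on_item_scraped, signal=signals.item_scraped)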
The launcher script (its file name is not given in the gist): it starts Splash first, then imports the spider module and starts the crawl in the same process.
#!/usr/bin/env python
import splash.server


def start_spider(**kwargs):
    # We need to wait until splash has started to import any scrapy module
    import scrapy_spider
    scrapy_spider.nm_factory = kwargs['network_manager_factory']
    scrapy_spider.start_spider()


if __name__ == '__main__':
    splash.server.main(server_factory=start_spider, argv=[])
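To run this, the first file must be importable as scrapy_spider (that is the module name both the DOWNLOAD_HANDLERS setting and the launcher reference); the launcher itself can be saved under any name and run with a plain python interpreter. splash.server.main() then boots Splash's Twisted reactor, passes its network manager factory into the start_spider hook, and the crawl runs in the same process, with every request rendered in a Splash browser tab instead of being fetched by Scrapy's default HTTP handler.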