import scrapy
import scrapy.exceptions
from scrapy import signals


class SecureSosStateOrUsSpider(scrapy.Spider):
    # Base class assumed; the original gist shows only these three methods.
    # name, start_urls, parse(), etc. are omitted from this excerpt.

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Register to receive the spider_idle signal."""
        spider = super(SecureSosStateOrUsSpider, cls).from_crawler(
            crawler, *args, **kwargs
        )
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        """Schedule a simple request in order to return the collected data."""
        if self.data_submitted:
            return
        # This is a hack: I don't yet know how to schedule a request that just
        # submits data _without_ also triggering a scrape. So I provide a URL
        # to a simple site whose response we're going to ignore.
        null_request = scrapy.Request(
            "http://neverssl.com/", callback=self.submit_data
        )
        self.crawler.engine.schedule(null_request, spider)
        raise scrapy.exceptions.DontCloseSpider

    def submit_data(self, _):
        """Return the collection of all the scraped data, ignoring the content
        actually fetched by the null request. I haven't figured out another way
        to submit the merged results.

        To be used as a callback when the spider is idle (i.e., has finished
        scraping).
        """
        self.data_submitted = True
        return self.sportsInventory
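
The excerpt assumes the spider initializes `data_submitted` and accumulates its merged results in `sportsInventory` elsewhere in the class. Here is a minimal sketch of that missing context; the spider name, start URL, and `parse` callback are illustrative assumptions, since the original gist shows only the three methods above.

    # Inside the same SecureSosStateOrUsSpider class. Everything here is an
    # illustrative assumption except the data_submitted and sportsInventory
    # names, which the methods above rely on.
    name = "secure_sos_state_or_us"        # hypothetical spider name
    start_urls = ["https://example.com/"]  # placeholder, not the real target

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_submitted = False  # spider_idle checks this flag
        self.sportsInventory = {}    # merged results, returned by submit_data

    def parse(self, response):
        # Merge each page's data into the shared collection instead of
        # yielding items per page; spider_idle submits the whole thing once.
        self.sportsInventory[response.url] = response.css("title::text").get()

The design works because raising DontCloseSpider keeps the engine alive long enough for the scheduled null request to execute. When submit_data runs, it flips data_submitted, so the next spider_idle signal falls through the guard and the spider is finally allowed to close.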