Sample spider for running the new asyncio support in Scrapy
import asyncio

import aiohttp
import scrapy
from scrapy.Fetch import Fetch  # experimental Fetch helper from the asyncio branch, not part of released Scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    async def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    async def fetch(self, session, url):
        # aiohttp helper used by parse(): GET the url and return the body as text.
        async with session.get(url) as resp:
            return await resp.text()

    async def parse(self, response):
        # Take the last two hrefs on the page; they get re-fetched below with Fetch.
        hrefs = response.xpath('//@href').extract()
        links = [hrefs[-1], hrefs[-2]]

        print("Started the aiohttp module!!")
        # verify_ssl is the older spelling; newer aiohttp versions use ssl=False instead.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            print("Inside the aiohttp Client Session!!")
            html = await self.fetch(session, 'https://python-forum.io/Thread-Exploring-async-await-without-knowing-how-they-work-ahead-of-time?pid=17292')
            print(html)
        print("Completed the aiohttp!!")

        # One has to get spider and crawler from the response in order to use Fetch. Will work on updating this!
        spider = response.spider
        crawler = response.crawler
        for link in links:
            # You can use `yield scrapy.Request(...)` instead if you want a callback such as parse2.
            res = await Fetch(url=link, crawler=crawler, spider=spider)
            print("Before the asyncio.sleep!!")
            await asyncio.sleep(5)
            print("___RESPONSE___ for link {!r}: {!r}".format(link, res))
        print("---------------------------END OF PARSE------------------------------------------------")

    async def parse2(self, response):
        page = response.url.split("/")[-2]
        print("/////////////////////-----------IN PARSE 2----------------------------//////////////////////")
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        print("----END OF PARSE2 ------------")