Skip to content

Instantly share code, notes, and snippets.

@ldonjibson
Forked from yashrsharma44/quotes.py
Created June 25, 2023 03:45
Show Gist options
  • Save ldonjibson/4c0760cc641bbff26e2fd457341e13c5 to your computer and use it in GitHub Desktop.
Save ldonjibson/4c0760cc641bbff26e2fd457341e13c5 to your computer and use it in GitHub Desktop.
Sample spider demonstrating the new asyncio support in Scrapy.
import scrapy
from scrapy.Fetch import Fetch
import asyncio
import aiohttp
class QuotesSpider(scrapy.Spider):
    """Sample spider that uses ``async``/``await`` inside Scrapy callbacks.

    ``parse`` mixes three kinds of awaitables in a single callback: an
    aiohttp request, the ``Fetch`` helper imported from ``scrapy.Fetch``,
    and ``asyncio.sleep``.
    """

    name = "quotes"

    async def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    async def fetch(self, session, url):
        """Return the body text of ``url`` downloaded through ``session``.

        ``parse`` calls ``self.fetch`` but the spider did not define it.

        Args:
            session: an open ``aiohttp.ClientSession``.
            url: absolute URL to download.
        """
        async with session.get(url) as resp:
            return await resp.text()

    async def parse(self, response):
        """Fetch one page with aiohttp, then the page's last two links.

        Everything is printed to stdout; nothing is yielded.
        """
        # Evaluate the XPath once. The reversed slice yields the last href
        # followed by the second-to-last one (the original order) and simply
        # returns fewer items, instead of raising IndexError, when the page
        # has fewer than two hrefs.
        hrefs = response.xpath('//@href').extract()
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL. NOTE(review): presumably Fetch needs an
        # absolute URL, like scrapy.Request does - confirm.
        links = [response.urljoin(href) for href in hrefs[:-3:-1]]

        print("Started the aiohttp module!!")
        # ``ssl=False`` skips certificate validation; it replaces the
        # deprecated ``verify_ssl=False`` keyword.
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            print("Inside the aiohttp Client Session!!")
            html = await self.fetch(session, 'https://python-forum.io/Thread-Exploring-async-await-without-knowing-how-they-work-ahead-of-time?pid=17292')
            print(html)
        print("Completed the aiohttp!!")

        # One has to get spider and crawler from the response in order to
        # use Fetch. The original author planned to update this.
        spider = response.spider
        crawler = response.crawler
        for link in links:
            # You can use ``yield scrapy.Request(...)`` instead when a
            # callback is wanted.
            res = await Fetch(url=link, crawler=crawler, spider=spider)
            print("Before the asyncio.sleep!!")
            await asyncio.sleep(5)
            print("___RESPONSE___and link {!r}__________________________________________________________{!r}".format(link, res))
        print("---------------------------END OF PARSE------------------------------------------------")

    async def parse2(self, response):
        """Save the response body to ``quotes-<page>.html``.

        ``<page>`` is the second-to-last ``/``-separated segment of the
        response URL. The file is written to the current working directory.
        """
        page = response.url.split("/")[-2]
        print("/////////////////////-----------IN PARSE 2----------------------------//////////////////////")
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
        print("----END OF PARSE2 ------------")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment