Skip to content

Instantly share code, notes, and snippets.

@clemfromspace
Last active June 26, 2021 08:02
Show Gist options
  • Select an option

  • Save clemfromspace/bbe9a99da1e86a753baa00e394234b8c to your computer and use it in GitHub Desktop.

Select an option

Save clemfromspace/bbe9a99da1e86a753baa00e394234b8c to your computer and use it in GitHub Desktop.
Scrapy with Puppeteer
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import asyncio
from pyppeteer import launch
from scrapy import signals
from scrapy.http import HtmlResponse
from twisted.internet import defer
from .http import PuppeteerRequest
def _force_deferred(coro):
    """Schedule *coro* on the asyncio loop and wrap it in a Twisted ``Deferred``.

    Uses ``defer.Deferred.fromFuture``, which propagates results, exceptions
    AND cancellation in both directions. The manual
    ``add_done_callback(dfd.callback)`` bridge it replaces fired the callback
    chain even for failed/cancelled futures and only surfaced the error when
    ``f.result()`` raised inside the callback.

    NOTE(review): this assumes Scrapy is running with the asyncio-compatible
    Twisted reactor so the asyncio loop actually advances — confirm the
    project installs ``AsyncioSelectorReactor``.
    """
    return defer.Deferred.fromFuture(asyncio.ensure_future(coro))
class PuppeteerMiddleware:
    """Scrapy downloader middleware that renders requests with pyppeteer.

    One shared headless browser is launched when the crawler starts and is
    closed on the ``spider_closed`` signal; every ``PuppeteerRequest`` gets
    its own page (tab), which is closed once the response is built.
    """

    # Shared pyppeteer Browser instance, set asynchronously in _from_crawler.
    browser = None

    @classmethod
    async def _from_crawler(cls, crawler):
        """Async part of initialization: launch the browser, hook signals."""
        middleware = cls()
        middleware.browser = await launch()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware (Scrapy entry point, must be sync)."""
        return _force_deferred(cls._from_crawler(crawler))

    async def _process_request(self, request, spider):
        """Render *request* with puppeteer if applicable.

        Returns an ``HtmlResponse`` built from the rendered DOM, or ``None``
        for non-``PuppeteerRequest`` requests so the normal download handler
        takes over. When ``request.screenshot`` is true, the raw screenshot
        bytes are stored under ``request.meta['screenshot']``.
        """
        if not isinstance(request, PuppeteerRequest):
            return None

        page = await self.browser.newPage()
        try:
            if request.cookies:
                # NOTE(review): pyppeteer's setCookie expects one dict per
                # cookie with at least 'name'/'value' keys — confirm the shape
                # of request.cookies matches before relying on this.
                await page.setCookie(request.cookies)
            await page.setViewport(
                {'width': 1200, 'height': 900, 'deviceScaleFactor': 2})
            await page.goto(request.url, {'waitUntil': request.wait_until})
            if request.screenshot:
                request.meta['screenshot'] = await page.screenshot()
            body = await page.content()
            # ``page.url`` is a property in pyppeteer; the original called it
            # as a method (``page.url()``), which raises TypeError.
            return HtmlResponse(
                page.url,
                body=body,
                encoding='utf-8',
                request=request,
            )
        finally:
            # Always close the tab — the original leaked one page per request
            # for the lifetime of the browser.
            await page.close()

    def process_request(self, request, spider):
        """Scrapy hook: bridge the async implementation onto a Deferred."""
        return _force_deferred(self._process_request(request, spider))

    async def _spider_closed(self):
        """Async part of shutdown: close the shared browser."""
        await self.browser.close()

    def spider_closed(self):
        """Shutdown the browser when the spider is closed."""
        return _force_deferred(self._spider_closed())
"""This module contains the ``SeleniumRequest`` class"""
from scrapy import Request
class PuppeteerRequest(Request):
    """Scrapy ``Request`` subclass providing additional puppeteer arguments."""

    def __init__(self, screenshot=False, wait_until='load', *args, **kwargs):
        """Initialize a new Puppeteer request.

        Parameters
        ----------
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of
            the screenshot will be returned in the response "meta" attribute.
        wait_until: str
            When navigation is considered finished, forwarded to pyppeteer's
            ``page.goto`` (e.g. 'load', 'domcontentloaded', 'networkidle0',
            'networkidle2'). The middleware reads ``request.wait_until``;
            without this attribute every request raised ``AttributeError``.
        """
        self.screenshot = screenshot
        self.wait_until = wait_until
        super().__init__(*args, **kwargs)
@NikolayGalkin
Copy link
Copy Markdown

Thanks for the gist. What about POST requests?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment