Last active
June 26, 2021 08:02
-
-
Save clemfromspace/bbe9a99da1e86a753baa00e394234b8c to your computer and use it in GitHub Desktop.
Scrapy with Puppeteer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This module contains the ``SeleniumMiddleware`` scrapy middleware""" | |
import asyncio | |
from pyppeteer import launch | |
from scrapy import signals | |
from scrapy.http import HtmlResponse | |
from twisted.internet import defer | |
from .http import PuppeteerRequest | |
def _force_deferred(coro):
    """Schedule ``coro`` on the asyncio loop and expose it as a Twisted Deferred.

    The returned Deferred fires once the scheduled future is done: it
    resolves to the coroutine's return value, or fails with whatever
    exception the coroutine raised.
    """

    def _unwrap(done_future):
        # ``result()`` re-raises the coroutine's exception, which Twisted
        # converts into a failure on the Deferred's errback chain.
        return done_future.result()

    deferred = defer.Deferred()
    deferred.addCallback(_unwrap)

    task = asyncio.ensure_future(coro)
    # The done-callback receives the finished future itself; hand it to the
    # Deferred so ``_unwrap`` can extract the outcome.
    task.add_done_callback(deferred.callback)
    return deferred
class PuppeteerMiddleware:
    """Scrapy middleware handling the requests using puppeteer

    Only ``PuppeteerRequest`` instances are rendered in the browser; every
    other request is left untouched for the regular downloader.
    """

    # Shared pyppeteer browser. Launched lazily on the first puppeteer request
    # so that ``from_crawler`` can return a ready-to-register instance.
    browser = None
    # In-flight (or finished) ``launch()`` task, shared so that concurrent
    # requests await a single browser launch instead of starting several.
    _launch_task = None

    @classmethod
    async def _from_crawler(cls, crawler):
        """Build the middleware and eagerly launch the browser.

        Kept for code that wants a fully started middleware from inside a
        coroutine; ``from_crawler`` itself no longer depends on it.
        """
        middleware = cls.from_crawler(crawler)
        await middleware._get_browser()
        return middleware

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware

        Returns the middleware instance itself. Scrapy looks up
        ``process_request`` on the returned object, so returning a Deferred
        here would leave the middleware unregistered.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    @staticmethod
    def _to_puppeteer_cookies(cookies, url):
        """Convert Scrapy-style cookies into a list of puppeteer cookie dicts.

        Parameters
        ----------
        cookies: dict or list of dict or None
            Either a ``{name: value}`` mapping or a list of cookie dicts
            (each carrying at least ``name`` and ``value``).
        url: str
            URL attached to cookies that specify neither ``url`` nor
            ``domain``; the fresh page is still blank when cookies are set,
            so the browser cannot infer where they belong.

        Returns
        -------
        list of dict
            New cookie dicts; the input objects are never mutated.
        """
        if not cookies:
            return []
        if isinstance(cookies, dict):
            cookies = [
                {'name': name, 'value': value}
                for name, value in cookies.items()
            ]

        converted = []
        for cookie in cookies:
            cookie = dict(cookie)
            if 'url' not in cookie and 'domain' not in cookie:
                cookie['url'] = url
            converted.append(cookie)
        return converted

    async def _get_browser(self):
        """Return the shared browser, launching it on first use."""
        if self.browser is None:
            if self._launch_task is None:
                self._launch_task = asyncio.ensure_future(launch())
            try:
                self.browser = await self._launch_task
            except Exception:
                # Forget the failed launch so a later request can retry.
                self._launch_task = None
                raise
        return self.browser

    async def _process_request(self, request, spider):
        """Process a request using puppeteer if applicable

        Returns ``None`` for non-puppeteer requests, otherwise an
        ``HtmlResponse`` built from the rendered page. When
        ``request.screenshot`` is truthy the screenshot data is stored in
        ``request.meta['screenshot']``.
        """
        if not isinstance(request, PuppeteerRequest):
            return None

        browser = await self._get_browser()
        page = await browser.newPage()
        try:
            cookies = self._to_puppeteer_cookies(request.cookies, request.url)
            if cookies:
                # ``setCookie`` takes one positional dict per cookie.
                await page.setCookie(*cookies)

            await page.setViewport({'width': 1200, 'height': 900, 'deviceScaleFactor': 2})

            # Only forward ``waitUntil`` when the request defines it, so
            # puppeteer's own default applies otherwise.
            wait_until = getattr(request, 'wait_until', None)
            options = {} if wait_until is None else {'waitUntil': wait_until}
            await page.goto(request.url, options)

            if request.screenshot:
                request.meta['screenshot'] = await page.screenshot()

            body = await page.content()
            # ``url`` is a property on pyppeteer pages, not a method.
            final_url = page.url
        finally:
            # Always release the tab, even when navigation fails; otherwise
            # every request leaks an open page in the shared browser.
            await page.close()

        return HtmlResponse(
            final_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def process_request(self, request, spider):
        """Scrapy hook: render puppeteer requests, pass everything else through.

        Returns ``None`` synchronously for non-puppeteer requests (no
        coroutine or Deferred is created), otherwise a Deferred firing with
        the rendered ``HtmlResponse``.
        """
        if not isinstance(request, PuppeteerRequest):
            return None
        return _force_deferred(self._process_request(request, spider))

    async def _spider_closed(self):
        """Close the browser if one was launched and reset the launch state."""
        browser, self.browser = self.browser, None
        self._launch_task = None
        if browser is not None:
            await browser.close()

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""
        if self.browser is None:
            # Nothing was ever launched, so there is nothing to wait for.
            return None
        return _force_deferred(self._spider_closed())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This module contains the ``SeleniumRequest`` class""" | |
from scrapy import Request | |
class PuppeteerRequest(Request):
    """Scrapy ``Request`` subclass providing additional arguments"""

    def __init__(self, screenshot=False, *args, wait_until='load', **kwargs):
        """Initialize a new Puppeteer request

        Parameters
        ----------
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of the screenshot
            will be returned in the response "meta" attribute.
        wait_until: str or list of str
            Keyword-only. Navigation event(s) puppeteer should wait for
            before the page counts as loaded; stored on the request as
            ``wait_until``. Defaults to ``'load'``.

        Notes
        -----
        ``screenshot`` is the first positional parameter, so the first
        positional argument is always bound to it. Pass the regular
        ``Request`` arguments (``url``, ``callback``, ...) as keywords.
        """
        self.screenshot = screenshot
        self.wait_until = wait_until
        super().__init__(*args, **kwargs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the gist. What about POST requests?