Scrapy download handler for HTTP/2 over an HTTP/1 proxy (TLS only). The handler opens an HTTP/1 CONNECT tunnel to the proxy, negotiates TLS through the tunnel, and only then starts the HTTP/2 connection.
""" | |
Download handler for HTTP/2 supporting proxy CONNECT over HTTP/1. | |
You can use this by configuring it in the spider settings: | |
DOWNLOAD_HANDLERS = { | |
'https': 'scrapy_h2_proxy.H2DownloadHandler', | |
} | |
Tested with Scrapy 2.5.0. | |
""" | |
import ipaddress
from collections import deque
from typing import Optional, Tuple

from twisted.internet.base import ReactorBase
from twisted.internet.defer import Deferred
from twisted.internet.endpoints import HostnameEndpoint
from twisted.web.client import URI, BrowserLikePolicyForHTTPS

from scrapy.core.http2.agent import (
    H2Agent,
    H2ConnectionPool as _H2ConnectionPool,
    H2ClientProtocol as _H2ClientProtocol,
    H2ClientFactory as _H2ClientFactory,
)
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import TunnelingTCP4ClientEndpoint
from scrapy.core.downloader.handlers.http2 import (
    ScrapyH2Agent as _ScrapyH2Agent,
    H2DownloadHandler as _H2DownloadHandler,
)
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
class H2ClientProtocol(_H2ClientProtocol):
    def connectionMade(self) -> None:
        """Called by Twisted when the connection is established.

        The parent opens with the HTTP/2 connection preface at this point, but
        that must not happen on the plaintext HTTP/1 connection to the proxy,
        so we postpone it until TLS negotiation has completed (see
        handshakeCompleted() below).
        """
        # Initialize the idle timeout.
        self.setTimeout(self.IDLE_TIMEOUT)

        destination = self.transport.getPeer()
        self.metadata['ip_address'] = ipaddress.ip_address(destination.host)

        # Unlike the parent, the H2 connection is NOT initiated here.

    def handshakeCompleted(self) -> None:
        """Initiate the H2 connection once the TLS handshake is complete."""
        # Do not call super(): it checks self.transport.negotiatedProtocol,
        # which is not set on the tunneled connection.
        self.conn.initiate_connection()
        self._write_to_transport()
class H2ClientFactory(_H2ClientFactory):
    def buildProtocol(self, addr) -> H2ClientProtocol:
        """Same as the parent, except it builds our own H2ClientProtocol."""
        return H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
class H2ConnectionPool(_H2ConnectionPool):
    def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
        """Same as the parent, except H2ClientFactory refers to our own factory."""
        self._pending_requests[key] = deque()

        conn_lost_deferred = Deferred()
        conn_lost_deferred.addCallback(self._remove_connection, key)

        factory = H2ClientFactory(uri, self.settings, conn_lost_deferred)
        conn_d = endpoint.connect(factory)
        conn_d.addCallback(self.put_connection, key)

        d = Deferred()
        self._pending_requests[key].append(d)
        return d
class ScrapyTunnelingH2OverH1Agent(H2Agent):
    """An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTP/2
    downloads. It may look strange that we have chosen to subclass Agent and
    not ProxyAgent, but consider that after the tunnel is opened the proxy is
    transparent to the client; thus the agent should behave as if there were
    no proxy involved.
    """

    def __init__(
        self,
        reactor: ReactorBase,
        proxy_auth: Optional[str],
        proxy_uri: URI,
        pool: H2ConnectionPool,
        context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
        connect_timeout: Optional[float] = None,
        bind_address: Optional[bytes] = None,
    ) -> None:
        super().__init__(
            reactor=reactor,
            pool=pool,
            context_factory=context_factory,
            connect_timeout=connect_timeout,
            bind_address=bind_address,
        )
        self._proxy_uri = proxy_uri
        self._proxy_auth = proxy_auth

    def get_endpoint(self, uri: URI):
        return TunnelingTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=(self._proxy_uri.host, self._proxy_uri.port, self._proxy_auth),
            contextFactory=self._context_factory,
            timeout=self.endpoint_factory._connectTimeout,
            bindAddress=self.endpoint_factory._bindAddress,
        )
    def get_key(self, uri: URI) -> Tuple:
        """Key on the proxy URI as well as the destination URI: each
        proxy/destination pair needs its own connection."""
        return "http-proxy", self._proxy_uri.host, self._proxy_uri.port, uri.host, uri.port
class H2DownloadHandler(_H2DownloadHandler):
    def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
        """Same as the parent, except H2ConnectionPool (and, in
        download_request(), ScrapyH2Agent) refer to the classes defined in
        this module."""
        self._crawler = crawler

        from twisted.internet import reactor
        self._pool = H2ConnectionPool(reactor, settings)
        self._context_factory = load_context_factory_from_settings(settings, crawler)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        agent = ScrapyH2Agent(
            context_factory=self._context_factory,
            pool=self._pool,
            crawler=self._crawler,
        )
        return agent.download_request(request, spider)
class ScrapyH2Agent(_ScrapyH2Agent):
    _TunnelingH2OverH1Agent = ScrapyTunnelingH2OverH1Agent

    def __init__(
        self,
        context_factory,
        pool: H2ConnectionPool,
        connect_timeout: int = 10,
        bind_address: Optional[bytes] = None,
        crawler: Optional[Crawler] = None,
    ) -> None:
        super().__init__(context_factory, pool, connect_timeout, bind_address, crawler)

    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
        from twisted.internet import reactor
        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, _proxy_host, _proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            omit_connect_tunnel = b'noconnect' in proxy_params
            if scheme == b'https' and not omit_connect_tunnel:
                # This is the case this module implements: HTTPS through an
                # HTTP/1 proxy, tunneled via CONNECT.
                proxy_auth = request.headers.get(b'Proxy-Authorization', None)
                return self._TunnelingH2OverH1Agent(
                    reactor=reactor,
                    proxy_auth=proxy_auth,
                    context_factory=self._context_factory,
                    proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                    connect_timeout=timeout,
                    bind_address=bind_address,
                    pool=self._pool,
                )

        # All other cases are handled by the regular ScrapyH2Agent.
        return super()._get_agent(request, timeout)
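For reference, a minimal usage sketch. It assumes the handler file is saved as scrapy_h2_proxy.py on the Python path; the proxy URL and the credentials are placeholders, not part of the handler above.

import base64

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'

    # Route https downloads through the handler defined above (the module
    # path 'scrapy_h2_proxy' is an assumption; adjust to where you saved it).
    custom_settings = {
        'DOWNLOAD_HANDLERS': {
            'https': 'scrapy_h2_proxy.H2DownloadHandler',
        },
    }

    def start_requests(self):
        yield scrapy.Request(
            'https://example.com/',
            # ScrapyH2Agent._get_agent() switches to the tunneling agent when
            # this meta key is present on an https request.
            meta={'proxy': 'http://proxy.example.com:8080'},
            # The handler forwards this header with the CONNECT request.
            headers={
                'Proxy-Authorization': b'Basic ' + base64.b64encode(b'user:pass'),
            },
        )

    def parse(self, response):
        # Response.protocol reports 'h2' when HTTP/2 was used (Scrapy 2.5+).
        self.logger.info('Got %s via %s', response.url, response.protocol)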