Skip to content

Instantly share code, notes, and snippets.

@wvengen
Last active October 26, 2023 16:17
Show Gist options
  • Save wvengen/84af3ab0da2ec5134407a71a094c8ad1 to your computer and use it in GitHub Desktop.
Save wvengen/84af3ab0da2ec5134407a71a094c8ad1 to your computer and use it in GitHub Desktop.
Scrapy download handler for HTTP/2 over an HTTP/1 proxy (TLS only).
"""
Download handler for HTTP/2 supporting proxy CONNECT over HTTP/1.
You can use this by configuring it in the spider settings (the example
assumes this file is importable as the module ``scrapy_h2_proxy``):

    DOWNLOAD_HANDLERS = {
        'https': 'scrapy_h2_proxy.H2DownloadHandler',
    }

Tested with Scrapy 2.5.0.
"""
import ipaddress
from collections import deque
from typing import Optional, Tuple
from twisted.internet import defer
from twisted.internet.base import ReactorBase
from twisted.internet.defer import Deferred
from twisted.internet.endpoints import HostnameEndpoint
from twisted.web.client import URI, BrowserLikePolicyForHTTPS, _StandardEndpointFactory
from scrapy.core.http2.agent import H2Agent, H2ConnectionPool as _H2ConnectionPool, H2ClientProtocol as _H2ClientProtocol, H2ClientFactory as _H2ClientFactory
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.downloader.handlers.http11 import TunnelingTCP4ClientEndpoint
from scrapy.core.downloader.handlers.http2 import ScrapyH2Agent as _ScrapyH2Agent, H2DownloadHandler as _H2DownloadHandler
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
class H2ClientProtocol(_H2ClientProtocol):
    """HTTP/2 client protocol that postpones the connection preamble.

    ``connectionMade`` only sets the idle timeout and records the peer
    address; the HTTP/2 connection itself is initiated later, from
    ``handshakeCompleted``.
    """

    def connectionMade(self) -> None:
        """Prepare the protocol once Twisted reports the connection is up.

        The parent sends the HTTP/2 connection preamble at this point. That
        has to be avoided for a connection that is still being proxied over
        HTTP/1, so here the preamble is left out and sent after TLS
        negotiation instead (see ``handshakeCompleted``).
        """
        # Initialize the idle timeout.
        self.setTimeout(self.IDLE_TIMEOUT)

        # Record the address of the peer the transport is connected to.
        peer = self.transport.getPeer()
        self.metadata['ip_address'] = ipaddress.ip_address(peer.host)

    def handshakeCompleted(self) -> None:
        """Initiate the HTTP/2 connection and write it to the transport.

        The parent implementation is deliberately not called: it checks
        ``self.transport.negotiatedProtocol``, which is not defined on the
        transport in this setup.
        """
        self.conn.initiate_connection()
        self._write_to_transport()
class H2ClientFactory(_H2ClientFactory):
    """Client factory that builds this module's ``H2ClientProtocol``."""

    def buildProtocol(self, addr) -> H2ClientProtocol:
        """Create the protocol instance for a new connection.

        Same as the parent implementation, except that the name
        ``H2ClientProtocol`` resolves to the subclass defined in this
        module. ``addr`` is accepted but not used.
        """
        protocol = H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
        return protocol
class H2ConnectionPool(_H2ConnectionPool):
    """Connection pool whose new connections use this module's factory."""

    def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
        """Start connecting through ``endpoint`` and queue a waiter for ``key``.

        Same as the parent implementation, except that the name
        ``H2ClientFactory`` resolves to the subclass defined in this module.

        :param key: pool key under which the pending requests are stored
        :param uri: URI handed to the client factory
        :param endpoint: endpoint used to establish the connection
        :return: a new ``Deferred`` that has been appended to the pending
            requests of ``key``
        """
        # Fresh queue of waiters for this key; created before connecting.
        self._pending_requests[key] = deque()

        # When the connection is lost, drop it from the pool.
        on_connection_lost = Deferred()
        on_connection_lost.addCallback(self._remove_connection, key)

        client_factory = H2ClientFactory(uri, self.settings, on_connection_lost)
        # Hand the established connection to the pool once connected.
        endpoint.connect(client_factory).addCallback(self.put_connection, key)

        waiter = Deferred()
        self._pending_requests[key].append(waiter)
        return waiter
class ScrapyTunnelingH2OverH1Agent(H2Agent):
    """HTTP/2 agent that connects through a L{TunnelingTCP4ClientEndpoint}.

    This subclasses the plain L{H2Agent} rather than a proxy-specific agent
    on purpose: once the tunnel has been opened the proxy is transparent to
    the client, so the agent should behave as if no proxy were involved.
    """

    def __init__(
        self,
        reactor: ReactorBase,
        proxy_auth: Optional[str],
        proxy_uri: URI,
        pool: H2ConnectionPool,
        context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
        connect_timeout: Optional[float] = None,
        bind_address: Optional[bytes] = None,
    ) -> None:
        """Set up the parent agent and remember the proxy details.

        :param reactor: reactor forwarded to the parent agent
        :param proxy_auth: proxy credentials, passed on unchanged as the
            third element of the endpoint's ``proxyConf``
        :param proxy_uri: URI of the proxy; its host and port are used
        :param pool: connection pool forwarded to the parent agent
        :param context_factory: TLS policy forwarded to the parent agent
        :param connect_timeout: forwarded to the parent agent
        :param bind_address: forwarded to the parent agent
        """
        super().__init__(
            reactor=reactor,
            pool=pool,
            context_factory=context_factory,
            connect_timeout=connect_timeout,
            bind_address=bind_address,
        )
        self._proxy_uri = proxy_uri
        self._proxy_auth = proxy_auth

    def get_endpoint(self, uri: URI):
        """Return a tunneling endpoint for ``uri`` that goes via the proxy.

        Timeout and bind address are taken from the agent's endpoint factory.
        """
        proxy = self._proxy_uri
        factory = self.endpoint_factory
        proxy_conf = (proxy.host, proxy.port, self._proxy_auth)
        return TunnelingTCP4ClientEndpoint(
            reactor=self._reactor,
            host=uri.host,
            port=uri.port,
            proxyConf=proxy_conf,
            contextFactory=self._context_factory,
            timeout=factory._connectTimeout,
            bindAddress=factory._bindAddress,
        )

    def get_key(self, uri: URI) -> Tuple:
        """Return the pool key for ``uri`` when reached through this proxy.

        The key combines proxy host/port with destination host/port, because
        every such combination needs its own connection.
        """
        proxy = self._proxy_uri
        return ("http-proxy", proxy.host, proxy.port, uri.host, uri.port)
class H2DownloadHandler(_H2DownloadHandler):
    """HTTP/2 download handler wired to this module's pool and agent."""

    def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
        """Store the crawler and build the connection pool and TLS context.

        Same as the parent implementation, except that the name
        ``H2ConnectionPool`` resolves to the subclass defined in this module.
        The parent ``__init__`` is not called.
        """
        # Imported inside the method, not at module level, as in the original.
        from twisted.internet import reactor

        self._crawler = crawler
        self._pool = H2ConnectionPool(reactor, settings)
        self._context_factory = load_context_factory_from_settings(settings, crawler)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        """Download ``request`` using a freshly created ``ScrapyH2Agent``.

        :return: whatever the agent's ``download_request`` returns
        """
        return ScrapyH2Agent(
            context_factory=self._context_factory,
            pool=self._pool,
            crawler=self._crawler,
        ).download_request(request, spider)
class ScrapyH2Agent(_ScrapyH2Agent):
    """Scrapy HTTP/2 agent that can tunnel HTTPS requests through a proxy."""

    # Agent class used for https requests that go through a proxy tunnel.
    _TunnelingH1OverH2Agent = ScrapyTunnelingH2OverH1Agent

    def __init__(
        self, context_factory,
        pool: H2ConnectionPool,
        connect_timeout: int = 10,
        bind_address: Optional[bytes] = None,
        crawler: Optional[Crawler] = None,
    ) -> None:
        """Forward all arguments, positionally and unchanged, to the parent."""
        super().__init__(context_factory, pool, connect_timeout, bind_address, crawler)

    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
        """Pick the agent that should perform ``request``.

        An https request with a ``proxy`` in its meta, whose proxy URL does
        not contain ``noconnect``, gets the tunneling agent. Every other
        request is delegated to the parent implementation.

        :param request: request to find an agent for
        :param timeout: passed to the tunneling agent as connect timeout
        """
        from twisted.internet import reactor

        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            # Only the last element of the parsed proxy URL is needed here.
            _, _, _proxy_host, _proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            tunnel_allowed = b'noconnect' not in proxy_params
            if tunnel_allowed and scheme == b'https':
                # This is the case this module adds support for.
                proxy_uri = URI.fromBytes(to_bytes(proxy, encoding='ascii'))
                return self._TunnelingH1OverH2Agent(
                    reactor=reactor,
                    proxy_auth=request.headers.get(b'Proxy-Authorization', None),
                    context_factory=self._context_factory,
                    proxy_uri=proxy_uri,
                    connect_timeout=timeout,
                    bind_address=bind_address,
                    pool=self._pool,
                )
        # Everything else is handled by the regular parent agent selection.
        return super()._get_agent(request, timeout)
@bezkos
Copy link

bezkos commented Jul 19, 2022

@wvengen Is it possible to adjust this h2 handler to close the proxy connection after every h2 request has finished?
My problem is that h2 doesn't support the "Connection: close" header, so even if I pass this specific header, the proxy doesn't close the connection and I don't get an IP refresh.

@pozernishku Did you find any solution to your problem? I have a similar one with "Connection: close".

@pozernishku
Copy link

@bezkos no

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment