Scrapy downloader that preserves header order
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, unicode_literals

import logging
from collections import OrderedDict
from time import time
from urllib.parse import urldefrag

from scrapy.core.downloader import Downloader
from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, \
    ScrapyAgent, _RequestBodyProducer
from scrapy.utils.python import to_bytes
from twisted.internet import reactor
from twisted.web.http_headers import Headers as TxHeaders

logger = logging.getLogger(__name__)

# To use this you need to do the following:
# 1. Save it in a file, for example my_scrapy_project/ordered_downloader.py
# 2. Register this download handler in settings.py of your Scrapy project,
#    or in the spider. For example, in settings:
#
# DOWNLOAD_HANDLERS_BASE = {
#     'data': 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler',
#     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
#     'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
#     # Custom download handler that preserves header order; use the path
#     # where you store your downloader.
#     'https': 'my_project.ordered_downloader.HeaderOrderDownloader',
#     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
#     'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
# }


class OrderedHeaders(TxHeaders):
    # Tweaked Twisted headers object that stores headers in an OrderedDict
    # and defines some order. First it sets the ordered headers, then it adds
    # the headers that are not in the defined ordering.

    def __init__(self, rawHeaders=None):
        # Define your order here.
        ordering = [b'Host', b'User-Agent', b'Accept-Encoding', b'Accept',
                    b'Connection', b'Cookie']
        self._rawHeaders = OrderedDict()
        if rawHeaders is not None:
            # Set the ordered headers first.
            for key in ordering:
                values = rawHeaders.get(key)
                if values is None:
                    # The request does not carry this header; skip it.
                    continue
                if not isinstance(values, list):
                    values = rawHeaders.getlist(key)
                self.setRawHeaders(key, values)
            # Then set the remaining headers.
            for name, values in rawHeaders.items():
                if name not in ordering:
                    self.setRawHeaders(name, values)

    def setRawHeaders(self, name, values):
        """
        Copy-pasted from Twisted; the only addition is the move_to_end()
        call after the header is set.
        """
        if not isinstance(values, list):
            raise TypeError("Header entry %r should be list but found "
                            "instance of %r instead" % (name, type(values)))
        name = self._encodeName(name)
        self._rawHeaders[name] = self._encodeValues(values)
        self._rawHeaders.move_to_end(name)


class ScrapyHeaderOrderAgent(ScrapyAgent):
    # Copy-pasted from Scrapy, tweaked to create OrderedHeaders instead of
    # the usual Twisted headers; the only difference from Scrapy is the
    # `headers = OrderedHeaders(request.headers)` line below.

    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = OrderedHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # using a producer with an empty string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None

        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers,
                          bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d


class HeaderOrderDownloader(HTTP11DownloadHandler):
    # Copy-pasted from Scrapy, tweaked to use ScrapyHeaderOrderAgent instead
    # of the default agent.

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download."""
        agent = ScrapyHeaderOrderAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize',
                             self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
        )
        return agent.download_request(request)
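To see what `OrderedHeaders` does in isolation, a quick check along these lines can help. This is a sketch, not part of the gist: it assumes the file was saved as `my_project/ordered_downloader.py`, matching the path used in the settings example above.

```python
from scrapy.http.headers import Headers

from my_project.ordered_downloader import OrderedHeaders  # assumed path

# Build a Scrapy Headers object in a deliberately scrambled order.
scrapy_headers = Headers({
    'X-Custom': 'foo',        # not in the `ordering` list, so it goes last
    'Accept': 'text/html',
    'User-Agent': 'my-bot/1.0',
    'Host': 'example.com',
})

ordered = OrderedHeaders(scrapy_headers)
for name, values in ordered.getAllRawHeaders():
    # Depending on the Twisted version, names may come back lowercased or
    # in canonical capitalization; the order is what matters here.
    print(name, values)
# Expected order: Host, User-Agent, Accept, then X-Custom.
```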
Summary
To piggyback on top of this, I needed to avoid browser fingerprinting by ordering headers. My goals were:

- ordering headers;
- controlling the `Content-Length` header for POST requests;
- sending case-sensitive headers.

My GitHub repository is here. I hope this will be useful to you; it's very similar to this gist's strategy. See also:

- My reddit post
- My stackoverflow post
Content-Length header

As pawelmhm mentions, `Content-Length` is added by Twisted: specifically here for a non-zero POST body and here for a zero-length body. The breadcrumb trail is: `ScrapyAgent` -> `download_request` -> `agent.request(...)` -> `TunnelingAgent` -> Twisted: `_requestWithEndpoint` -> `Request._construct` -> `_writeToBodyProducerContentLength` or `_writeToEmptyBodyContentLength`.

These functions, `_writeToBodyProducerContentLength` and `_writeToEmptyBodyContentLength`, call `_writeHeaders` and do something like `self._writeHeaders(transport, b"Content-Length: 0\r\n")`. As a result, `Content-Length` is added to the top of your headers. If you try to include `Content-Length` yourself, something like this will happen and you will get a `400` error.
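If the goal is merely to avoid that duplicate header (rather than to control its position or value), one workaround is to strip any user-supplied `Content-Length` before the request goes out, so only Twisted's computed value is sent. A minimal sketch of a downloader middleware; the class name is hypothetical and not from the gist or my repository:

```python
class StripContentLengthMiddleware:
    # Drop any user-supplied Content-Length so that Twisted's own computed
    # value is the only one on the wire, avoiding the duplicate-header 400
    # described above.
    def process_request(self, request, spider):
        if b'Content-Length' in request.headers:
            del request.headers[b'Content-Length']
        return None  # continue normal downloader processing
```

You would enable it under `DOWNLOADER_MIDDLEWARES` in your project's `settings.py`.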
As a result, if you want to control `Content-Length`, you must edit the Twisted code itself. See my repository README.md for instructions on that.

Case sensitive headers
This is more of a well-documented aspect. To do this, you modify the `_caseMappings` attribute of the internal Twisted headers class. You are writing an override that tells Twisted not to abide by its usual behavior for this header.

Let's say you have the header `aAbBcCdD`, and you see it is getting sent as `Aabbccdd`. You can go into your spider and, before the request is yielded, add code along the lines of the sketch below. The key is made lowercase because Twisted encodes the name into bytes via this function, which calls the `.lower()` method.
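A minimal sketch of that override. It relies on `Headers._caseMappings`, a private Twisted dict mapping lowercased names to their preferred capitalization, so the exact shape may vary across Twisted versions:

```python
from twisted.web.http_headers import Headers

# _caseMappings maps the lowercased header name to the exact capitalization
# Twisted should put on the wire. The key must be lowercase because Twisted
# lowercases names when encoding them to bytes.
Headers._caseMappings[b'aabbccdd'] = b'aAbBcCdD'
```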
Additional resources for that can be found here: Scrapy #2711
Developing
If you want to confirm that your headers are being sent as you expect, I'd recommend using Charles Proxy or Fiddler. You can download Fiddler here, and you can configure Fiddler for Python applications with something like this. Essentially, you will end up sending your request to a localhost proxy by specifying `meta = {'proxy': '127.0.0.1:8866'}` on the request.
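For example, a minimal spider along these lines; the spider name and URL are placeholders, and depending on your Scrapy version the proxy value may need an explicit `http://` scheme:

```python
import scrapy


class HeaderCheckSpider(scrapy.Spider):
    # Hypothetical spider used only to inspect outgoing headers in Fiddler.
    name = 'header_check'

    def start_requests(self):
        yield scrapy.Request(
            'https://httpbin.org/headers',  # echoes the request headers back
            # Route the request through the local Fiddler proxy mentioned
            # above; newer Scrapy versions require the scheme prefix.
            meta={'proxy': 'http://127.0.0.1:8866'},
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info('Headers as the server saw them: %s', response.text)
```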