Created
September 30, 2017 00:46
-
-
Save gboudreau/2f74a58adea787a2d0efffd2bd6cd26d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HG changeset patch | |
# Parent 60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec | |
Issue #24363: Add policy flag to avoid parsing HTTP header as email body | |
diff -r 60085c8f01fe Lib/email/errors.py | |
--- a/Lib/email/errors.py Thu Sep 08 22:37:34 2016 -0400 | |
+++ b/Lib/email/errors.py Mon Jan 23 23:39:53 2017 +0000 | |
@@ -55,8 +55,9 @@ | |
class MissingHeaderBodySeparatorDefect(MessageDefect): | |
"""Found line with no leading whitespace and no colon before blank line.""" | |
-# XXX: backward compatibility, just in case (it was never emitted). | |
-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect | |
+ | |
+class MalformedHeaderDefect(MessageDefect): | |
+ """An ordinary header line did not match the expected format.""" | |
class MultipartInvariantViolationDefect(MessageDefect): | |
"""A message claimed to be a multipart but no subparts were found.""" | |
diff -r 60085c8f01fe Lib/email/feedparser.py | |
--- a/Lib/email/feedparser.py Thu Sep 08 22:37:34 2016 -0400 | |
+++ b/Lib/email/feedparser.py Mon Jan 23 23:39:53 2017 +0000 | |
@@ -169,6 +169,9 @@ | |
self._last = None | |
self._headersonly = False | |
+ # True to parse an HTTP header section that is detached from any body | |
+ self.__body_detached = getattr(policy, "_py_body_detached", False) | |
+ | |
# Non-public interface for supporting Parser's headersonly flag | |
def _set_headersonly(self): | |
self._headersonly = True | |
@@ -233,9 +236,18 @@ | |
# (i.e. newline), just throw it away. Otherwise the line is | |
# part of the body so push it back. | |
if not NLCRE.match(line): | |
- defect = errors.MissingHeaderBodySeparatorDefect() | |
+ if self.__body_detached: | |
+ defect = "Invalid header line: " + repr(line) | |
+ defect = errors.MalformedHeaderDefect(defect) | |
+ else: | |
+ defect = errors.MissingHeaderBodySeparatorDefect() | |
+ self._input.unreadline(line) | |
self.policy.handle_defect(self._cur, defect) | |
- self._input.unreadline(line) | |
+ if self.__body_detached: | |
+ # Even when the line looks blank, it could be a lone "\r", | |
+ # which the HTTP parser does not treat as the end of the | |
+ # header section, so skip it in case other header fields | |
+ # follow. | |
break | |
headers.append(line) | |
# Done with the headers, so parse them and figure out what we're | |
@@ -322,7 +334,7 @@ | |
lines.append(line) | |
self._cur.set_payload(EMPTYSTRING.join(lines)) | |
return | |
- # Make sure a valid content type was specified per RFC 2045:6.4. | |
+ # Make sure a valid encoding was specified per RFC 2045:6.4. | |
if (self._cur.get('content-transfer-encoding', '8bit').lower() | |
not in ('7bit', '8bit', 'binary')): | |
defect = errors.InvalidMultipartContentTransferEncodingDefect() | |
@@ -499,7 +511,7 @@ | |
line = line[:-len(mo.group(0))] | |
self._cur.set_unixfrom(line) | |
continue | |
- elif lineno == len(lines) - 1: | |
+ elif not self.__body_detached and lineno == len(lines) - 1: | |
# Something looking like a unix-from at the end - it's | |
# probably the first line of the body, so push back the | |
# line and stop. | |
@@ -511,10 +523,13 @@ | |
defect = errors.MisplacedEnvelopeHeaderDefect(line) | |
self._cur.defects.append(defect) | |
continue | |
+ | |
# Split the line on the colon separating field name from value. | |
- # There will always be a colon, because if there wasn't the part of | |
- # the parser that calls us would have started parsing the body. | |
- i = line.find(':') | |
+ # There will always be a colon, because if there wasn't, | |
+ # it would have been picked up by the part of | |
+ # the parser that calls us, or the continuation or envelope | |
+ # checks above. | |
+ i = line.index(':') | |
# If the colon is on the start of the line the header is clearly | |
# malformed, but we might be able to salvage the rest of the | |
@@ -524,7 +539,6 @@ | |
self._cur.defects.append(defect) | |
continue | |
- assert i>0, "_parse_headers fed line with no : and no leading WS" | |
lastheader = line[:i] | |
lastvalue = [line] | |
# Done with all the lines, so handle the last header. | |
diff -r 60085c8f01fe Lib/http/client.py | |
--- a/Lib/http/client.py Thu Sep 08 22:37:34 2016 -0400 | |
+++ b/Lib/http/client.py Mon Jan 23 23:39:53 2017 +0000 | |
@@ -69,6 +69,7 @@ | |
""" | |
import email.parser | |
+import email.policy | |
import email.message | |
import http | |
import io | |
@@ -191,14 +192,16 @@ | |
lst.append(line) | |
return lst | |
+class _Policy(email.policy.Compat32): | |
+ _py_body_detached = True | |
+ | |
def parse_headers(fp, _class=HTTPMessage): | |
"""Parses only RFC2822 headers from a file pointer. | |
- email Parser wants to see strings rather than bytes. | |
- But a TextIOWrapper around self.rfile would buffer too many bytes | |
- from the stream, bytes which we later need to read as bytes. | |
- So we read the correct bytes here, as bytes, for email Parser | |
- to parse. | |
+ The parser works with text strings rather than bytes. | |
+ But a TextIOWrapper may internally buffer too many bytes from the stream, | |
+ bytes which we later need to read. So we read the correct number of | |
+ bytes from the stream before decoding them to text to be parsed. | |
""" | |
headers = [] | |
@@ -212,7 +215,8 @@ | |
if line in (b'\r\n', b'\n', b''): | |
break | |
hstring = b''.join(headers).decode('iso-8859-1') | |
- return email.parser.Parser(_class=_class).parsestr(hstring) | |
+ parser = email.parser.Parser(_class=_class, policy=_Policy()) | |
+ return parser.parsestr(hstring) | |
class HTTPResponse(io.BufferedIOBase): |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment