Last active
August 29, 2015 14:05
-
-
Save smerritt/fe83af7b38b82502cec1 to your computer and use it in GitHub Desktop.
If a MIME streams in a forest, does anyone care?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# | |
# An overly-strict multipart-MIME parser thingy. Works for my contrived | |
# examples, at least. Does things in a streaming fashion so you can deal with | |
# large (multi-GB) multipart MIME messages without using all your memory. | |
import email.parser | |
import pprint | |
import random | |
import sys | |
# Upper bound on how many bytes consume_headers() is willing to buffer while
# searching for the blank line that terminates a header block.
MAX_HEADERS_SIZE = 1024 * 1024  # 1 MiB
def extract_boundary(buf, document_iter):
    """
    Consume the headers off a multipart MIME message and find the boundary.

    The header block is removed from ``buf[0]``; whatever follows it stays
    buffered for the caller.

    :param buf: one-element list holding the data read so far but not yet
                consumed; it is updated in place
    :param document_iter: iterator yielding further chunks of the document
    :returns: MIME boundary, with any surrounding double quotes removed
    :raises ValueError: if there is no Content-Type header or it carries no
                        non-empty boundary parameter
    """
    headers = consume_headers(buf, document_iter)

    # Header field names are case-insensitive, so don't insist on the
    # canonical "Content-Type" spelling.
    content_type = None
    for name, value in headers.items():
        if name.lower() == 'content-type':
            content_type = value
            break
    if content_type is None:
        raise ValueError("couldn't find boundary in %r" % (headers,))

    # e.g. Content-Type: multipart/mixed; charset=utf-8; boundary="97d1eb16"
    # Everything after the first ';' is a list of name=value parameters in
    # arbitrary order. Boundaries may legally contain '=' but never ';', so
    # split parameters on ';' and each parameter on its first '='.
    for param in content_type.split(';')[1:]:
        name, sep, value = param.partition('=')
        if not sep or name.strip().lower() != 'boundary':
            continue
        value = value.strip()
        if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        if value:
            return value

    raise ValueError("couldn't find boundary in %r" % (headers,))
def consume_until(marker, buf, document_iter):
    """
    Consume the document until a particular marker string is found. Yields
    document chunks before the marker. Note: these document chunks are not
    necessarily the chunks that document_iter yields.

    When the generator finishes, the marker itself has been removed from the
    input and ``buf[0]`` holds whatever was read past it.

    :param marker: string to search for
    :param buf: one-element list holding the data read so far but not yet
                consumed; it is updated in place
    :param document_iter: iterator yielding further chunks of the document
    :raises ValueError: if the document ends before the marker is found
    """
    marker_len = len(marker)
    # The marker may be split across chunk boundaries, so the last
    # (marker_len - 1) buffered bytes could still be the start of a marker
    # and must be held back. Everything before them cannot be part of a
    # marker (we just searched and found none) and is safe to hand out.
    holdback = marker_len - 1

    marker_index = buf[0].find(marker)
    while marker_index < 0:
        releasable = len(buf[0]) - holdback
        if releasable > 0:
            head = buf[0][:releasable]
            # Trim the buffer before yielding so it is in a consistent state
            # while the caller is processing the chunk.
            buf[0] = buf[0][releasable:]
            yield head

        try:
            chunk = next(document_iter)
        except StopIteration:
            # Letting StopIteration escape a generator either ends it
            # silently or becomes a RuntimeError; report truncation clearly.
            raise ValueError(
                "Document ended before marker %r was found" % (marker,))
        buf[0] += chunk
        marker_index = buf[0].find(marker)

    if marker_index > 0:
        yield buf[0][:marker_index]
    # Consume the marker off the input, and we're done
    buf[0] = buf[0][(marker_index + marker_len):]
def consume_headers(buf, document_iter):
    """
    Read a header block off the front of the document and parse it.

    Data is pulled from document_iter until the blank line that ends the
    headers has been buffered. The header block (including that blank line)
    is removed from ``buf[0]``; anything after it stays buffered.

    :param buf: one-element list holding the data read so far but not yet
                consumed; it is updated in place
    :param document_iter: iterator yielding further chunks of the document
    :returns: dict mapping header names to values
    :raises ValueError: if the document ends before the headers do, or if
                        more than MAX_HEADERS_SIZE bytes are buffered without
                        the end of the headers showing up
    """
    # If your message is using non-CRLF line endings, you need to go fix it.
    # This isn't one of those permissive parsers that eats any old garbage
    # thrown at it. This parser can handle RFC2046-compliant messages, and
    # that's all it's designed to do.
    terminator = '\r\n\r\n'

    end_headers_index = buf[0].find(terminator)
    while end_headers_index < 0:
        try:
            chunk = next(document_iter)
        except StopIteration:
            raise ValueError("Document ended before the end of the headers")
        buf[0] += chunk
        end_headers_index = buf[0].find(terminator)

        # Only complain about size while the terminator is still missing;
        # a big chunk holding short headers plus body data is not an error.
        if end_headers_index < 0 and len(buf[0]) > MAX_HEADERS_SIZE:
            raise ValueError("Headers too big! Buffered %d bytes, "
                             "but only allowed %d" % (len(buf[0]),
                                                      MAX_HEADERS_SIZE))

    split_point = end_headers_index + len(terminator)
    headers = buf[0][:split_point]
    buf[0] = buf[0][split_point:]
    return dict(email.parser.Parser().parsestr(headers).items())
def multipart_mime_parser_generator(document_iter):
    """
    Process a document as multipart/mime in a streaming fashion.

    Returns an iterable of (header, part_iter) pairs, where header is a dict
    of the part's headers and part_iter yields the part's body in chunks.

    Each part_iter reads from the same underlying document, so it has to be
    used up before the next part can be parsed. Anything the caller left
    unread is discarded when the next (header, part_iter) pair is requested.

    :param document_iter: iterator yielding chunks of the document
    :raises ValueError: if a delimiter line is malformed
    """
    buf = ['']

    # First, get the boundary. Can't parse the rest of the message until we
    # know what the boundary is.
    boundary = extract_boundary(buf, document_iter)
    part_boundary = "\r\n--%s" % boundary

    # Every delimiter is searched for as CRLF + "--" + boundary. When there
    # is no preamble, the body starts directly with "--boundary" and the CRLF
    # in front of it was already consumed as the end of the header block.
    # Put a CRLF back so the first delimiter matches like all the others;
    # otherwise the first part would be silently skipped as preamble.
    buf[0] = "\r\n" + buf[0]

    # Skip the preamble (if any) up to and including the first delimiter.
    for _junk in consume_until(part_boundary, buf, document_iter):
        pass

    while True:
        # Now we're looking at the end of a delimiter line; we've consumed the
        # "--yurts" part. Now we need to see if we have "--\r\n", which would
        # indicate end-of-message ("--yurts--\r\n" is an end delimiter, which
        # terminates the message) or if we simply have \r\n, which is a plain
        # separator ("--yurts\r\n" separates message parts).
        #
        # According to RFC 2046, there's a "transport-padding" that comes
        # before \r\n and consists of linear white space (space and tab).
        # However, it also says that though composers MUST NOT generate
        # non-empty transport padding, consumers MUST be able to handle
        # padding added by message transports. However, we're stuffing data
        # through a TCP socket; we're not emailing it or posting it to Usenet.
        # Let's just pretend we didn't read that part of the RFC and assume
        # there's never any transport padding.
        while len(buf[0]) < 4:  # 4 = len("--\r\n")
            try:
                buf[0] += next(document_iter)
            except StopIteration:
                # Input is exhausted; decide based on what we have.
                break

        if buf[0].startswith("--\r\n") or buf[0] == "--":
            # Found an end delimiter; we're done. The second form is an end
            # delimiter sitting at the very end of the input with no
            # trailing CRLF.
            return
        elif buf[0].startswith("\r\n"):
            # Found a delimiter; clean it out of the buffer and keep parsing
            buf[0] = buf[0][2:]
        else:
            # Neither a delimiter nor a close delimiter; this document is
            # malformed.
            raise ValueError("Malformed delimiter in %r" % buf[0])

        headers = consume_headers(buf, document_iter)
        part_iter = consume_until(part_boundary, buf, document_iter)
        yield (headers, part_iter)

        # Discard whatever the caller did not read so the buffer is
        # positioned right after the next delimiter. This is a no-op when
        # the caller already exhausted part_iter.
        for _unread in part_iter:
            pass
if __name__ == '__main__':
    # Demo driver: parse each file named on the command line and dump every
    # part's headers and body chunks.
    import io

    for fname in sys.argv[1:]:
        # newline='' turns off newline translation. The parser only accepts
        # CRLF line endings, and default text mode on Python 3 would turn
        # every CRLF into a bare LF before the parser ever saw it.
        with io.open(fname, newline='') as fp:
            # Feed the parser randomly sized chunks to exercise the handling
            # of delimiters that are split across chunk boundaries.
            reader = iter(lambda: fp.read(random.randint(1, 1024)), '')
            gen = multipart_mime_parser_generator(reader)
            for h, b in gen:
                pprint.pprint(h)
                pprint.pprint(list(b))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment