Skip to content

Instantly share code, notes, and snippets.

@smerritt
Last active August 29, 2015 14:05
Show Gist options
  • Save smerritt/fe83af7b38b82502cec1 to your computer and use it in GitHub Desktop.
Save smerritt/fe83af7b38b82502cec1 to your computer and use it in GitHub Desktop.
If a MIME streams in a forest, does anyone care?
#!/usr/bin/env python
#
# An overly-strict multipart-MIME parser thingy. Works for my contrived
# examples, at least. Does things in a streaming fashion so you can deal with
# large (multi-GB) multipart MIME messages without using all your memory.
import email.parser
import pprint
import random
import sys
MAX_HEADERS_SIZE = 1024 * 1024 # 1 MiB
def extract_boundary(buf, document_iter):
    """
    Consume the headers off a multipart MIME message and find the boundary.

    :param buf: one-element list holding buffered, not-yet-consumed input
    :param document_iter: iterator yielding chunks of the document
    :returns: MIME boundary (without surrounding quotes, if any)
    :raises ValueError: if there is no Content-Type header or no boundary
        parameter can be extracted from it
    """
    headers = consume_headers(buf, document_iter)
    if 'Content-Type' not in headers:
        raise ValueError("couldn't find boundary in %r" % (headers,))
    ct = headers['Content-Type']
    # e.g. Content-Type: multipart/mixed; boundary=97d1eb16bd0e6d559bc30e3c43
    #
    # Walk the parameters after the media type and pick out "boundary"
    # specifically; taking the first parameter blindly would return the
    # wrong thing for e.g. "multipart/mixed; charset=x; boundary=y".
    # (RFC 2046 boundary characters never include ';', so a plain split
    # on ';' is safe even for quoted boundaries.)
    for param in ct.split(';')[1:]:
        name, _, value = param.partition('=')
        name = name.strip()
        value = value.strip()
        if name.lower() == 'boundary' and value:
            # Quoted boundaries get their quotes stripped.
            if value.startswith('"') and value.endswith('"'):
                return value[1:-1]
            return value
    # Malformed Content-Type (no usable boundary parameter). Previously this
    # fell over with an IndexError; ValueError matches the documented
    # contract.
    raise ValueError("couldn't find boundary in %r" % (headers,))
def consume_until(marker, buf, document_iter):
    """
    Consume the document until a particular marker string is found. Yields
    document chunks before the marker. Note: these document chunks are not
    necessarily the chunks that document_iter yields.

    On return, the marker has been consumed and buf[0] holds whatever
    followed it.

    :param marker: string to search for
    :param buf: one-element list holding buffered, not-yet-consumed input
    :param document_iter: iterator yielding further chunks of the document
    """
    marker_len = len(marker)
    while True:
        marker_index = buf[0].find(marker)
        if marker_index >= 0:
            break
        # No complete marker in the buffer. If the marker were partially in
        # buf[0] and partially in subsequent chunks, a simple
        # 'marker in buf[0]' would miss it; any such straddling occurrence
        # must begin within the last (marker_len - 1) bytes, so everything
        # before that point is safe to hand to the caller now.
        #
        # It is important to search *before* yielding: the previous version
        # yielded a fixed-size prefix first, which could hand the caller the
        # front half of a marker that had just been completed by the latest
        # chunk, losing the marker entirely.
        if len(buf[0]) >= marker_len:
            safe = len(buf[0]) - (marker_len - 1)
            yield buf[0][:safe]
            buf[0] = buf[0][safe:]
        buf[0] += next(document_iter)
    if marker_index > 0:
        yield buf[0][:marker_index]
    # Consume the marker off the input, and we're done
    buf[0] = buf[0][(marker_index + marker_len):]
def consume_headers(buf, document_iter):
    """
    Read the MIME headers off the front of the document and return them as
    a plain dict; the consumed bytes (headers plus blank line) are removed
    from the buffer.

    If your message is using non-CRLF line endings, you need to go fix it.
    This isn't one of those permissive parsers that eats any old garbage
    thrown at it. This parser can handle RFC2046-compliant messages, and
    that's all it's designed to do.
    """
    terminator = '\r\n\r\n'
    while terminator not in buf[0]:
        buf[0] += next(document_iter)
        # Cap buffering so a message with no blank line can't eat all RAM.
        if len(buf[0]) > MAX_HEADERS_SIZE:
            raise ValueError("Headers too big! Buffered %d bytes, "
                             "but only allowed %d" % (len(buf[0]),
                                                      MAX_HEADERS_SIZE))
    raw_headers, _sep, remainder = buf[0].partition(terminator)
    buf[0] = remainder
    parsed = email.parser.Parser().parsestr(raw_headers + terminator)
    return dict(parsed.items())
def multipart_mime_parser_generator(document_iter):
    """
    Process a document as multipart/mime in a streaming fashion.

    Returns an iterable of (header, part_iter) pairs.

    The caller is responsible for completely consuming each part_iter prior to
    requesting the next (header, part_iter) pair.

    :param document_iter: iterable yielding chunks of the document
    """
    # One-element list so the helper functions can mutate the shared string
    # buffer in place (a poor man's mutable reference).
    buf = ['']
    # First, get the boundary. Can't parse the rest of the message until we
    # know what the boundary is.
    boundary = extract_boundary(buf, document_iter)
    part_boundary = "\r\n--%s" % boundary
    # Discard the preamble: everything up to and including the first
    # boundary line.
    for _junk in consume_until(part_boundary, buf, document_iter):
        pass
    while True:
        # Now we're looking at the end of a delimiter line; we've consumed the
        # "--yurts" part. Now we need to see if we have "--\r\n", which would
        # indicate end-of-message ("--yurts--\r\n" is an end delimiter, which
        # terminates the message) or if we simply have \r\n, which is a plain
        # separator ("--yurts\r\n" separates message parts).
        #
        # According to RFC 2046, there's a "transport-padding" that comes
        # before \r\n and consists of linear white space (space and tab).
        # However, it also says that though composers MUST NOT generate
        # non-empty transport padding, consumers MUST be able to handle
        # padding added by message transports. However, we're stuffing data
        # through a TCP socket; we're not emailing it or posting it to Usenet.
        # Let's just pretend we didn't read that part of the RFC and assume
        # there's never any transport padding.
        while len(buf[0]) < 4: # 4 = len("--\r\n")
            buf[0] += next(document_iter)
        if buf[0].startswith("--\r\n"):
            # Found an end delimiter; we're done.
            return
        elif buf[0].startswith("\r\n"):
            # Found a delimiter; clean it out of the buffer and keep parsing
            buf[0] = buf[0][2:]
        else:
            # Neither a delimiter nor a close delimiter; this document is
            # malformed.
            raise ValueError("Malformed delimiter in %r" % buf[0])
        # Each part is its own headers followed by a body that runs until
        # the next boundary. The caller must drain part_iter before asking
        # for the next pair, since both share buf and document_iter.
        headers = consume_headers(buf, document_iter)
        part_iter = consume_until(part_boundary, buf, document_iter)
        yield (headers, part_iter)
if __name__ == '__main__':
    for fname in sys.argv[1:]:
        # newline='' disables universal-newline translation. Without it,
        # Python 3's text mode rewrites the CRLF sequences as bare '\n',
        # and the parser's '\r\n'-based boundary/header searches all fail.
        with open(fname, newline='') as fp:
            # Feed the parser randomly-sized chunks to exercise the
            # streaming (marker-straddling) code paths.
            reader = iter(lambda: fp.read(random.randint(1, 1024)), '')
            gen = multipart_mime_parser_generator(reader)
            for headers, body in gen:
                pprint.pprint(headers)
                pprint.pprint(list(body))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment