smerritt · August 29, 2015 14:05
diff --git a/mimestream.py b/mimestream.py
 #!/usr/bin/env python
 #
 # An overly-strict multipart-MIME parser thingy. Works for my contrived
 # examples, at least. Does things in a streaming fashion so you can deal with
 # large (multi-GB) multipart MIME messages without using all your memory.
 import email.parser
 import pprint
 import random
 import sys


 # Upper bound on how much data consume_headers() will buffer while it is
 # still looking for the blank line that ends a header block; going over it
 # raises ValueError instead of buffering without limit.
 MAX_HEADERS_SIZE = 1024 * 1024  # 1 MiB


 def extract_boundary(buf, document_iter):
    """
    Consume the headers off a multipart MIME message and find the boundary.

    :param buf: single-element list holding the not-yet-consumed text of the
        document; it is passed on to consume_headers(), which updates it in
        place
    :param document_iter: iterator yielding further chunks of the document

    :returns: MIME boundary, with any surrounding double quotes removed
    :raises ValueError: if there is no Content-Type header, or if it has no
        non-empty boundary parameter
    """

    headers = consume_headers(buf, document_iter)

    # Header field names are case-insensitive, so don't insist on one
    # particular spelling of "Content-Type".
    content_type = None
    for name, value in headers.items():
        if name.lower() == 'content-type':
            content_type = value
            break
    if content_type is None:
        raise ValueError("couldn't find boundary in %r" % (headers,))

    # e.g. Content-Type: multipart/mixed; boundary=97d1eb16bd0e6d559bc30e3c43
    #
    # The boundary is not necessarily the first (or the only) parameter, so
    # look at every ';'-separated parameter that follows the media type
    # instead of assuming a fixed position.
    for param in content_type.split(';')[1:]:
        name, sep, value = param.partition('=')
        if not sep or name.strip().lower() != 'boundary':
            continue
        # strip() also gets rid of the whitespace left by folded headers
        value = value.strip()
        if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        if value:
            return value

    raise ValueError("couldn't find boundary in %r" % (headers,))


 def consume_until(marker, buf, document_iter):
    """
    Consume the document until a particular marker string is found. Yields
    document chunks before the marker. Note: these document chunks are not
    necessarily the chunks that document_iter yields.

    :param marker: string to search for
    :param buf: single-element list holding the not-yet-consumed text of the
        document; on completion it holds whatever followed the marker
    :param document_iter: iterator yielding further chunks of the document

    :raises ValueError: if the document ends before the marker is found
    """

    marker_len = len(marker)

    # If the marker is split across chunks, at most (marker_len - 1) of its
    # characters can be sitting at the end of the buffer. That tail is the
    # only thing we have to hold back; everything before it is known not to
    # be part of the marker and can be handed to the caller right away.
    holdback = marker_len - 1

    marker_index = buf[0].find(marker)
    while marker_index < 0:
        safe_len = len(buf[0]) - holdback
        if safe_len > 0:
            yield buf[0][:safe_len]
            buf[0] = buf[0][safe_len:]

        try:
            chunk = next(document_iter)
        except StopIteration:
            # Don't let StopIteration escape from a generator; report the
            # truncated document the same way other malformed input is.
            raise ValueError("Document ended before %r was found" % (marker,))
        buf[0] += chunk

        # Search only after the new data has been appended, so that a marker
        # straddling two chunks is seen in one piece.
        marker_index = buf[0].find(marker)

    if marker_index > 0:
        yield buf[0][:marker_index]

    # Consume the marker off the input, and we're done
    buf[0] = buf[0][(marker_index + marker_len):]


 def consume_headers(buf, document_iter, max_headers_size=None):
    """
    Consume one block of headers, up to and including the blank line that
    ends it, and return the headers as a dict.

    :param buf: single-element list holding the not-yet-consumed text of the
        document; on return it holds whatever followed the headers
    :param document_iter: iterator yielding further chunks of the document
    :param max_headers_size: largest header block to accept; defaults to
        MAX_HEADERS_SIZE

    :returns: dict mapping header names to header values
    :raises ValueError: if the headers are larger than max_headers_size, or
        if the document ends before the headers do
    """
    # If your message is using non-CRLF line endings, you need to go fix it.
    # This isn't one of those permissive parsers that eats any old garbage
    # thrown at it. This parser can handle RFC2046-compliant messages, and
    # that's all it's designed to do.
    if max_headers_size is None:
        max_headers_size = MAX_HEADERS_SIZE

    def read_more():
        try:
            buf[0] += next(document_iter)
        except StopIteration:
            raise ValueError("Document ended before the end of the headers")

    # An empty header block is nothing but the blank line itself. Searching
    # for '\r\n\r\n' in that case would run on into the body and swallow
    # part of it, so handle it up front.
    while len(buf[0]) < 2:  # 2 = len('\r\n')
        read_more()
    if buf[0].startswith('\r\n'):
        buf[0] = buf[0][2:]
        return {}

    terminator = '\r\n\r\n'
    end_headers_index = buf[0].find(terminator)
    while end_headers_index < 0:
        # No terminator in what we have so far, so all of it is headers.
        if len(buf[0]) > max_headers_size:
            raise ValueError("Headers too big! Buffered %d bytes, "
                             "but only allowed %d" % (len(buf[0]),
                                                      max_headers_size))
        # Already-searched data can only contribute the terminator's first
        # three characters, so there's no need to rescan from the start.
        search_from = max(0, len(buf[0]) - (len(terminator) - 1))
        read_more()
        end_headers_index = buf[0].find(terminator, search_from)

    # Judge the size of the headers themselves, not of the buffer: a single
    # chunk may well carry small headers followed by lots of body data.
    if end_headers_index > max_headers_size:
        raise ValueError("Headers too big! Buffered %d bytes, "
                         "but only allowed %d" % (end_headers_index,
                                                  max_headers_size))

    split_point = end_headers_index + len(terminator)
    headers = buf[0][:split_point]
    buf[0] = buf[0][split_point:]

    return dict(email.parser.Parser().parsestr(headers).items())


 def multipart_mime_parser_generator(document_iter):
    """
    Process a document as multipart/mime in a streaming fashion.

    Returns an iterable of (header, part_iter) pairs, where header is a dict
    of the part's headers and part_iter yields the part's body in chunks.

    Each part_iter reads from the same underlying document, so it is only
    usable until the next (header, part_iter) pair is requested; whatever
    the caller has not read from it by then is skipped.

    :param document_iter: iterable yielding chunks of the document

    :raises ValueError: if the document is malformed
    """

    # Everything below pulls chunks with next(), so turn whatever iterable we
    # were given into an iterator first.
    document_iter = iter(document_iter)

    buf = ['']

    # First, get the boundary. Can't parse the rest of the message until we
    # know what the boundary is.
    boundary = extract_boundary(buf, document_iter)

    part_boundary = "\r\n--%s" % boundary

    # A delimiter is a line break followed by "--boundary". When there's no
    # preamble, the first delimiter is the very first line of the body, and
    # the line break in front of it was already eaten as part of the blank
    # line ending the headers. Put a line break back so that the first
    # delimiter looks like all the others.
    buf[0] = "\r\n" + buf[0]

    for _junk in consume_until(part_boundary, buf, document_iter):
        pass

    while True:
        # Now we're looking at the end of a delimiter line; we've consumed the
        # "--yurts" part. Now we need to see if we have "--\r\n", which would
        # indicate end-of-message ("--yurts--\r\n" is an end delimiter, which
        # terminates the message) or if we simply have \r\n, which is a plain
        # separator ("--yurts\r\n" separates message parts).
        #
        # According to RFC 2046, there's a "transport-padding" that comes
        # before \r\n and consists of linear white space (space and tab).
        # However, it also says that though composers MUST NOT generate
        # non-empty transport padding, consumers MUST be able to handle
        # padding added by message transports. However, we're stuffing data
        # through a TCP socket; we're not emailing it or posting it to Usenet.
        # Let's just pretend we didn't read that part of the RFC and assume
        # there's never any transport padding.
        while len(buf[0]) < 4:  # 4 = len("--\r\n")
            try:
                buf[0] += next(document_iter)
            except StopIteration:
                # Out of input; decide based on what we've got.
                break

        if buf[0].startswith("--\r\n") or buf[0] == "--":
            # Found an end delimiter; we're done. The line break after it is
            # optional when the document ends right there, and a bare "--"
            # can only be in the buffer if we ran out of input.
            return
        elif buf[0].startswith("\r\n"):
            # Found a delimiter; clean it out of the buffer and keep parsing
            buf[0] = buf[0][2:]
        else:
            # Neither a delimiter nor a close delimiter; this document is
            # malformed.
            raise ValueError("Malformed delimiter in %r" % buf[0])

        headers = consume_headers(buf, document_iter)
        part_iter = consume_until(part_boundary, buf, document_iter)
        yield (headers, part_iter)

        # If the caller didn't read the whole part, skip the rest of it so
        # that we're positioned right after the next delimiter.
        for _unread in part_iter:
            pass


 if __name__ == '__main__':
    # Parse each file named on the command line and dump its parts.
    import io

    for fname in sys.argv[1:]:
        # newline='' turns off newline translation. Without it, Python 3's
        # text mode hands us "\n" where the file has "\r\n", and the parser,
        # which insists on CRLF, never finds the end of the headers.
        with io.open(fname, newline='') as fp:
            # Read in randomly-sized chunks to exercise the handling of
            # markers that are split across chunk boundaries.
            reader = iter(lambda: fp.read(random.randint(1, 1024)), '')
            gen = multipart_mime_parser_generator(reader)
            for h, b in gen:
                pprint.pprint(h)
                pprint.pprint(list(b))
	#!/usr/bin/env python
	#
	# An overly-strict multipart-MIME parser thingy. Works for my contrived
	# examples, at least. Does things in a streaming fashion so you can deal with
	# large (multi-GB) multipart MIME messages without using all your memory.
	import email.parser
	import pprint
	import random
	import sys


	# Upper bound on how much data consume_headers() will buffer while it is
	# still looking for the blank line that ends a header block; going over it
	# raises ValueError instead of buffering without limit.
	MAX_HEADERS_SIZE = 1024 * 1024 # 1 MiB


	def extract_boundary(buf, document_iter):
	    """
	    Consume the headers off a multipart MIME message and find the boundary.

	    :param buf: single-element list holding the not-yet-consumed text of the
	        document; it is passed on to consume_headers(), which updates it in
	        place
	    :param document_iter: iterator yielding further chunks of the document

	    :returns: MIME boundary, with any surrounding double quotes removed
	    :raises ValueError: if there is no Content-Type header, or if it has no
	        non-empty boundary parameter
	    """

	    headers = consume_headers(buf, document_iter)

	    # Header field names are case-insensitive, so don't insist on one
	    # particular spelling of "Content-Type".
	    content_type = None
	    for name, value in headers.items():
	        if name.lower() == 'content-type':
	            content_type = value
	            break
	    if content_type is None:
	        raise ValueError("couldn't find boundary in %r" % (headers,))

	    # e.g. Content-Type: multipart/mixed; boundary=97d1eb16bd0e6d559bc30e3c43
	    #
	    # The boundary is not necessarily the first (or the only) parameter, so
	    # look at every ';'-separated parameter that follows the media type
	    # instead of assuming a fixed position.
	    for param in content_type.split(';')[1:]:
	        name, sep, value = param.partition('=')
	        if not sep or name.strip().lower() != 'boundary':
	            continue
	        # strip() also gets rid of the whitespace left by folded headers
	        value = value.strip()
	        if len(value) >= 2 and value.startswith('"') and value.endswith('"'):
	            value = value[1:-1]
	        if value:
	            return value

	    raise ValueError("couldn't find boundary in %r" % (headers,))


	def consume_until(marker, buf, document_iter):
	    """
	    Consume the document until a particular marker string is found. Yields
	    document chunks before the marker. Note: these document chunks are not
	    necessarily the chunks that document_iter yields.

	    :param marker: string to search for
	    :param buf: single-element list holding the not-yet-consumed text of the
	        document; on completion it holds whatever followed the marker
	    :param document_iter: iterator yielding further chunks of the document

	    :raises ValueError: if the document ends before the marker is found
	    """

	    marker_len = len(marker)

	    # If the marker is split across chunks, at most (marker_len - 1) of its
	    # characters can be sitting at the end of the buffer. That tail is the
	    # only thing we have to hold back; everything before it is known not to
	    # be part of the marker and can be handed to the caller right away.
	    holdback = marker_len - 1

	    marker_index = buf[0].find(marker)
	    while marker_index < 0:
	        safe_len = len(buf[0]) - holdback
	        if safe_len > 0:
	            yield buf[0][:safe_len]
	            buf[0] = buf[0][safe_len:]

	        try:
	            chunk = next(document_iter)
	        except StopIteration:
	            # Don't let StopIteration escape from a generator; report the
	            # truncated document the same way other malformed input is.
	            raise ValueError("Document ended before %r was found" % (marker,))
	        buf[0] += chunk

	        # Search only after the new data has been appended, so that a marker
	        # straddling two chunks is seen in one piece.
	        marker_index = buf[0].find(marker)

	    if marker_index > 0:
	        yield buf[0][:marker_index]

	    # Consume the marker off the input, and we're done
	    buf[0] = buf[0][(marker_index + marker_len):]


	def consume_headers(buf, document_iter, max_headers_size=None):
	    """
	    Consume one block of headers, up to and including the blank line that
	    ends it, and return the headers as a dict.

	    :param buf: single-element list holding the not-yet-consumed text of the
	        document; on return it holds whatever followed the headers
	    :param document_iter: iterator yielding further chunks of the document
	    :param max_headers_size: largest header block to accept; defaults to
	        MAX_HEADERS_SIZE

	    :returns: dict mapping header names to header values
	    :raises ValueError: if the headers are larger than max_headers_size, or
	        if the document ends before the headers do
	    """
	    # If your message is using non-CRLF line endings, you need to go fix it.
	    # This isn't one of those permissive parsers that eats any old garbage
	    # thrown at it. This parser can handle RFC2046-compliant messages, and
	    # that's all it's designed to do.
	    if max_headers_size is None:
	        max_headers_size = MAX_HEADERS_SIZE

	    def read_more():
	        try:
	            buf[0] += next(document_iter)
	        except StopIteration:
	            raise ValueError("Document ended before the end of the headers")

	    # An empty header block is nothing but the blank line itself. Searching
	    # for '\r\n\r\n' in that case would run on into the body and swallow
	    # part of it, so handle it up front.
	    while len(buf[0]) < 2:  # 2 = len('\r\n')
	        read_more()
	    if buf[0].startswith('\r\n'):
	        buf[0] = buf[0][2:]
	        return {}

	    terminator = '\r\n\r\n'
	    end_headers_index = buf[0].find(terminator)
	    while end_headers_index < 0:
	        # No terminator in what we have so far, so all of it is headers.
	        if len(buf[0]) > max_headers_size:
	            raise ValueError("Headers too big! Buffered %d bytes, "
	                             "but only allowed %d" % (len(buf[0]),
	                                                      max_headers_size))
	        # Already-searched data can only contribute the terminator's first
	        # three characters, so there's no need to rescan from the start.
	        search_from = max(0, len(buf[0]) - (len(terminator) - 1))
	        read_more()
	        end_headers_index = buf[0].find(terminator, search_from)

	    # Judge the size of the headers themselves, not of the buffer: a single
	    # chunk may well carry small headers followed by lots of body data.
	    if end_headers_index > max_headers_size:
	        raise ValueError("Headers too big! Buffered %d bytes, "
	                         "but only allowed %d" % (end_headers_index,
	                                                  max_headers_size))

	    split_point = end_headers_index + len(terminator)
	    headers = buf[0][:split_point]
	    buf[0] = buf[0][split_point:]

	    return dict(email.parser.Parser().parsestr(headers).items())


	def multipart_mime_parser_generator(document_iter):
	    """
	    Process a document as multipart/mime in a streaming fashion.

	    Returns an iterable of (header, part_iter) pairs, where header is a dict
	    of the part's headers and part_iter yields the part's body in chunks.

	    Each part_iter reads from the same underlying document, so it is only
	    usable until the next (header, part_iter) pair is requested; whatever
	    the caller has not read from it by then is skipped.

	    :param document_iter: iterable yielding chunks of the document

	    :raises ValueError: if the document is malformed
	    """

	    # Everything below pulls chunks with next(), so turn whatever iterable we
	    # were given into an iterator first.
	    document_iter = iter(document_iter)

	    buf = ['']

	    # First, get the boundary. Can't parse the rest of the message until we
	    # know what the boundary is.
	    boundary = extract_boundary(buf, document_iter)

	    part_boundary = "\r\n--%s" % boundary

	    # A delimiter is a line break followed by "--boundary". When there's no
	    # preamble, the first delimiter is the very first line of the body, and
	    # the line break in front of it was already eaten as part of the blank
	    # line ending the headers. Put a line break back so that the first
	    # delimiter looks like all the others.
	    buf[0] = "\r\n" + buf[0]

	    for _junk in consume_until(part_boundary, buf, document_iter):
	        pass

	    while True:
	        # Now we're looking at the end of a delimiter line; we've consumed the
	        # "--yurts" part. Now we need to see if we have "--\r\n", which would
	        # indicate end-of-message ("--yurts--\r\n" is an end delimiter, which
	        # terminates the message) or if we simply have \r\n, which is a plain
	        # separator ("--yurts\r\n" separates message parts).
	        #
	        # According to RFC 2046, there's a "transport-padding" that comes
	        # before \r\n and consists of linear white space (space and tab).
	        # However, it also says that though composers MUST NOT generate
	        # non-empty transport padding, consumers MUST be able to handle
	        # padding added by message transports. However, we're stuffing data
	        # through a TCP socket; we're not emailing it or posting it to Usenet.
	        # Let's just pretend we didn't read that part of the RFC and assume
	        # there's never any transport padding.
	        while len(buf[0]) < 4:  # 4 = len("--\r\n")
	            try:
	                buf[0] += next(document_iter)
	            except StopIteration:
	                # Out of input; decide based on what we've got.
	                break

	        if buf[0].startswith("--\r\n") or buf[0] == "--":
	            # Found an end delimiter; we're done. The line break after it is
	            # optional when the document ends right there, and a bare "--"
	            # can only be in the buffer if we ran out of input.
	            return
	        elif buf[0].startswith("\r\n"):
	            # Found a delimiter; clean it out of the buffer and keep parsing
	            buf[0] = buf[0][2:]
	        else:
	            # Neither a delimiter nor a close delimiter; this document is
	            # malformed.
	            raise ValueError("Malformed delimiter in %r" % buf[0])

	        headers = consume_headers(buf, document_iter)
	        part_iter = consume_until(part_boundary, buf, document_iter)
	        yield (headers, part_iter)

	        # If the caller didn't read the whole part, skip the rest of it so
	        # that we're positioned right after the next delimiter.
	        for _unread in part_iter:
	            pass


	if __name__ == '__main__':
	    # Parse each file named on the command line and dump its parts.
	    import io

	    for fname in sys.argv[1:]:
	        # newline='' turns off newline translation. Without it, Python 3's
	        # text mode hands us "\n" where the file has "\r\n", and the parser,
	        # which insists on CRLF, never finds the end of the headers.
	        with io.open(fname, newline='') as fp:
	            # Read in randomly-sized chunks to exercise the handling of
	            # markers that are split across chunk boundaries.
	            reader = iter(lambda: fp.read(random.randint(1, 1024)), '')
	            gen = multipart_mime_parser_generator(reader)
	            for h, b in gen:
	                pprint.pprint(h)
	                pprint.pprint(list(b))