Last active
March 30, 2024 02:40
-
-
Save bonzini/d5bc1946475487167c529f9699e39512 to your computer and use it in GitHub Desktop.
Split a mailbox into separate patch files, stripping the transfer encoding and minimizing the headers along the way.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
# mbox_split.py
#
# Split a mailbox into separate patch files, stripping the transfer encoding
# and minimizing the headers along the way.
#
# Written by Paolo Bonzini <[email protected]>
import argparse | |
import re | |
import sys | |
import email.parser, email.header | |
def subj_to_name(subj):
    """Convert a subject to a filename.

    The result has the form ``NNNN-area-text.patch``:

    * ``NNNN`` is the first ``N/M`` counter found inside a leading
      ``[...]`` tag (e.g. ``[PATCH 2/5]``), or 1 when there is no such
      tag or the tag has no counter.
    * ``area`` is the following ``[...]`` tag or ``word:`` prefix, or
      ``misc`` when neither is present.
    * ``text`` is whatever remains of the subject.

    In ``area`` and ``text`` every character outside ``[a-zA-Z0-9_-]``
    becomes a dash, runs of dashes are collapsed, and leading/trailing
    dashes are dropped.

    A subject of None is treated as an empty subject.
    """

    # You can write Perl in any language. - Edgar Dijkstra, probably.
    def dashify(text):
        text = re.sub(r"[^a-zA-Z0-9_-]", "-", text)
        text = re.sub(r"--+", "-", text)
        # After the first substitution no "." or newline can remain, so
        # trimming dashes is all that is needed at either end.
        return text.strip("-")

    if subj is None:
        subj = ""

    # Unfold continuation lines.  No flags are needed: the pattern has no
    # "." and the fourth positional argument of re.sub() is the
    # replacement count, not the flags.
    subj = re.sub(r"\n\s+", " ", subj)

    # Leading "[PATCH n/m]"-style tag: extract the patch number, drop the tag.
    num = 1
    m = re.match(r"""\s* (\[ [^]]* \] )""", subj, re.X)
    if m:
        m2 = re.search(r"""([0-9]+)/[0-9]+""", m.group(0), re.X)
        if m2:
            num = int(m2.group(1))
        subj = subj[m.end():]

    # Optional area prefix: either another "[...]" tag or "something:".
    area = "misc"
    m = re.match(r"""\s* ( \[ [^]]* \] | \S+: )?""", subj, re.X)
    if m and m.group(1):
        area = dashify(m.group(1))
        subj = subj[m.end():]

    text = dashify(subj.strip())
    return "%04d-%s-%s.patch" % (num, area, text)
# Signatures that mark a message body as containing a patch.  The pattern
# is compiled in verbose mode, where unescaped whitespace is ignored, so
# every space that must match literally is written as "\ ".
_PATCH_RE = re.compile(
    rb"""
      ^---.* ^\+\+\+.* ^@@                      # unified diff with a hunk
    | ^diff.* ^index.* ^GIT\ binary\ patch      # git binary patch
    | ^diff.* ^old\ mode\ .* ^new\ mode         # mode-only change
    """,
    re.M | re.S | re.X,
)


def has_patch(body):
    """Return whether the body includes a patch.

    ``body`` is a bytes object.  The result is the match object of the
    first patch signature found (truthy), or None when there is none.
    Three shapes are recognized, each spanning several lines: a unified
    diff (``---`` / ``+++`` / ``@@``), a git binary patch, and a git
    mode-only change (``old mode`` / ``new mode``).
    """
    return _PATCH_RE.search(body)
def header_to_string(v):
    """Decode a MIME encoded header value.

    The value is split into its (bytes, charset) chunks with
    email.header.decode_header() and reassembled with
    email.header.make_header().  The result is an email.header.Header
    object; formatting it with ``str()`` or ``%s`` yields the Unicode text.
    """
    decoded_chunks = email.header.decode_header(v)
    header = email.header.make_header(decoded_chunks)
    return header
def do_single(msg, outfile=None):
    """Remove unnecessary headers from the message as well as
    content-transfer-encoding, and print it to outfile or to
    a file whose name is derived from the subject.  If the
    latter, the name of the file is printed to stdout.

    msg is a parsed email message.  For a multipart message only the
    first leaf part is exported.  outfile, when given, is a binary
    stream; it is flushed but left open, since it belongs to the caller.

    The module-level ``args`` namespace is consulted: unless
    ``args.keep_cr`` is set, CRLF line endings in the body become LF,
    and unless ``args.nopatch`` is set, messages whose body does not
    look like a patch are skipped without writing anything.
    """

    def open_output_file(msg):
        subject = msg["Subject"]
        # Decode MIME encoded-words first so the file name is built from
        # the readable subject; a missing subject is treated as empty.
        name = subj_to_name(
            "" if subject is None else str(header_to_string(subject))
        )
        print(name)
        return open(name, "wb")

    def write_message(f):
        # From/Subject/Date come from the message itself; Content-Type is
        # taken from the part whose payload is written, so that it
        # describes the body actually present in the output.
        headers = [(k, msg[k]) for k in ("From", "Subject", "Date")]
        headers.append(("Content-Type", container["Content-Type"]))
        for k, v in headers:
            if v is not None:
                f.write(("%s: %s\n" % (k, header_to_string(v))).encode())
        f.write(b"\n")
        f.write(body)

    # Descend to the first leaf part; get_payload(decode=True) returns
    # None for a multipart container.
    container = msg
    while container.is_multipart() and container.get_payload():
        container = container.get_payload(0)
    body = container.get_payload(decode=True)
    if body is None:
        body = b""

    if not args.keep_cr:
        body = body.replace(b"\r\n", b"\n")
    if not args.nopatch and not has_patch(body):
        return

    if outfile is not None:
        write_message(outfile)
        outfile.flush()
    else:
        with open_output_file(msg) as f:
            write_message(f)
def split_mbox(stream, func):
    """Split an mbox file and pass each part to a function func.

    stream is an iterable of bytes lines.  Every line starting with
    ``From `` is a message separator and is discarded; the lines between
    two separators are fed to an email.parser.BytesFeedParser and the
    resulting message object is passed to func.  Blank lines before the
    first header line of a message are skipped.  Lines quoted as
    ``>From `` have the leading ``>`` removed.
    """
    parser = None
    for line in stream:
        if line.startswith(b"From "):
            # finish the previous message
            if parser is not None:
                func(parser.close())
                parser = None
            continue

        if parser is None:
            if not line.strip():
                continue
            parser = email.parser.BytesFeedParser()

        # Only "From " lines are quoted in an mbox, so require the
        # trailing space: ">Fromage" must be left alone.
        if line.startswith(b">From "):
            line = line[1:]
        parser.feed(line)

    # flush the last message, which has no separator after it
    if parser is not None:
        func(parser.close())
# Command-line interface.  The parsed namespace is kept in the module-level
# name ``args`` because do_single() reads args.keep_cr and args.nopatch.
parser = argparse.ArgumentParser(
    description="Splits a given mailbox into separate patch files"
)
parser.add_argument(
    "--nopatch",
    action="store_true",
    default=False,
    help="exports even if it's not a patch",
)
parser.add_argument(
    "--single",
    action="store_true",
    default=False,
    help="do not split mbox file",
)
parser.add_argument(
    "--keep-cr",
    action="store_true",
    default=False,
    help=r"do not remove \r from lines ending with \r\n",
)
parser.add_argument(
    "mbox",
    metavar="<mailbox file>",
    nargs="?",
    help='specifies the mailbox file; if "-" or absent, read from stdin',
)
args = parser.parse_args()

# Read from stdin unless a mailbox file was named.
if not args.mbox or args.mbox == "-":
    infile = sys.stdin.buffer
else:
    infile = open(args.mbox, "rb")

try:
    if args.single:
        # The whole input is one message; write it to stdout.
        msg = email.parser.BytesParser().parse(infile)
        do_single(msg, sys.stdout.buffer)
    else:
        # One output file per message, named after its subject.
        split_mbox(infile, do_single)
finally:
    # Close only the file opened here; stdin is not ours to close.
    if infile is not sys.stdin.buffer:
        infile.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment