Skip to content

Instantly share code, notes, and snippets.

@staticshock
Last active September 19, 2018 23:55
Show Gist options
  • Save staticshock/5749cbcdc41e3ef461d571c40b5f928b to your computer and use it in GitHub Desktop.
Save staticshock/5749cbcdc41e3ef461d571c40b5f928b to your computer and use it in GitHub Desktop.
Convert mbox data to csv data
import csv
import mailbox
import email.header
import sys
import bs4
import re
from dateutil.parser import parse as parse_date
def run(mbox_path):
    """Write every message in the mbox file at ``mbox_path`` to stdout as CSV.

    The output has a header row followed by one row per message with the
    columns ``Date``, ``From``, ``Subject`` and ``Body``.

    Args:
        mbox_path: Filesystem path of an existing mbox file.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_path`` does not exist.
    """
    writer = csv.DictWriter(sys.stdout, ["Date", "From", "Subject", "Body"])
    writer.writeheader()
    # create=False: a mistyped path should fail loudly instead of silently
    # creating an empty mbox file and emitting a header-only CSV.
    mbox = mailbox.mbox(mbox_path, create=False)
    try:
        for message in mbox:
            raw_date = get_header(message, 'Date')
            date = ""
            if raw_date:
                try:
                    date = str(parse_date(raw_date))
                except (ValueError, OverflowError):
                    # Keep the raw header rather than abort the whole export
                    # because one message carries an unparseable date.
                    date = raw_date
            writer.writerow({
                'From': get_header(message, 'From'),
                'Subject': get_header(message, 'Subject'),
                'Date': date,
                'Body': get_payload(message),
            })
    finally:
        mbox.close()
def get_header(message, header_name):
    """Return the value of ``header_name`` from ``message`` as decoded text.

    RFC 2047 encoded words (``=?charset?b?...?=`` / ``=?charset?q?...?=``)
    are decoded wherever they appear in the header, each with its declared
    charset.

    Args:
        message: Any object with a mapping-style ``get(name)`` method, such
            as an ``email.message.Message``.
        header_name: Name of the header to look up.

    Returns:
        The decoded header as ``str``. A missing header is returned as
        ``None`` and an empty one as-is.
    """
    header = message.get(header_name)
    if not header:
        return header
    fragments = []
    for fragment, charset in email.header.decode_header(header):
        # decode_header yields bytes for every fragment once the header
        # contains at least one encoded word; otherwise it yields str.
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or "ascii")
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrongly declared charset: keep what is readable.
                fragment = fragment.decode("utf-8", errors="replace")
        fragments.append(fragment)
    return "".join(fragments)
def get_payload(message):
    """Return the decoded content of all parts of ``message`` as one ``str``.

    Each part's payload is transfer-decoded, converted to text with the
    part's declared charset (UTF-8 with replacement as a fallback), and
    appended to the result with no separator. Parts that look like HTML are
    reduced to their text content.

    Args:
        message: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The concatenated text with runs of blank lines collapsed to a single
        blank line and surrounding whitespace stripped.
    """
    html_patterns = ('<html', '<table', '<div')
    chunks = []
    for part in message.walk():
        payload = part.get_payload(decode=True)
        if not payload:
            # walk() also yields multipart containers, whose decoded payload
            # is None; empty parts have nothing to contribute either.
            continue
        if isinstance(payload, bytes):
            try:
                text = payload.decode(part.get_content_charset() or "utf-8")
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrongly declared charset: keep what is readable.
                text = payload.decode("utf-8", errors="replace")
        else:
            text = payload
        # Turn HTML into plain text if necessary. If the message is sent as
        # both plain text and html, this results in some duplication in the
        # output; that is accepted in favour of not losing content.
        lowered = text.lower()
        if any(pattern in lowered for pattern in html_patterns):
            text = bs4.BeautifulSoup(text, "html5lib").text
        chunks.append(text)
    # Collapse runs of blank lines so each CSV cell stays compact.
    return re.sub(r'(\r?\n){2,}', r'\1\1', "".join(chunks)).strip()
if __name__ == "__main__":
    # Guarded so that importing this module does not start a conversion.
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: {} MBOX_PATH".format(sys.argv[0]))
    run(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment