Last active
September 19, 2018 23:55
-
-
Save staticshock/5749cbcdc41e3ef461d571c40b5f928b to your computer and use it in GitHub Desktop.
Convert mbox data to csv data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import mailbox | |
import email.header | |
import sys | |
import bs4 | |
import re | |
from dateutil.parser import parse as parse_date | |
def run(mbox_path):
    """Convert the mbox file at *mbox_path* to CSV on standard output.

    A header row is written first, followed by one row per message with the
    columns ``Date``, ``From``, ``Subject`` and ``Body``.

    Args:
        mbox_path: Path of an existing mbox file.

    Raises:
        mailbox.NoSuchMailboxError: If *mbox_path* does not exist.
    """
    # create=False: the default (create=True) would silently create an empty
    # mailbox file when the path is mistyped.  Open the mailbox before
    # emitting anything so a bad path produces no partial output.
    box = mailbox.mbox(mbox_path, create=False)
    try:
        writer = csv.DictWriter(
            sys.stdout, ["Date", "From", "Subject", "Body"])
        writer.writeheader()
        for message in box:
            writer.writerow({
                'From': get_header(message, 'From'),
                'Subject': get_header(message, 'Subject'),
                'Date': _format_date(get_header(message, 'Date')),
                'Body': get_payload(message),
            })
    finally:
        box.close()


def _format_date(raw_date):
    """Return *raw_date* normalised by ``parse_date``, or a safe fallback.

    A missing/empty header becomes an empty string and an unparseable one is
    passed through unchanged, so a single odd message cannot abort the whole
    conversion.
    """
    if not raw_date:
        return ""
    try:
        return str(parse_date(raw_date))
    except (ValueError, OverflowError):
        return raw_date
def get_header(message, header_name):
    """Return header *header_name* of *message* with encoded words decoded.

    RFC 2047 encoded words (``=?charset?q?...?=``) anywhere in the value are
    decoded using the charset each one declares.

    Args:
        message: Object exposing ``get(name)``, e.g. a mailbox message.
        header_name: Name of the header to look up.

    Returns:
        The decoded header text.  A missing or empty header is returned
        unchanged (``None`` when the header is absent).  A header whose
        encoded words cannot be parsed is returned undecoded.
    """
    from email.errors import HeaderParseError

    header = message.get(header_name)
    if not header:
        return header
    # The lookup may hand back a non-str object; normalise it to text.
    header = str(header)
    if '=?' not in header:
        return header

    try:
        decoded_parts = email.header.decode_header(header)
    except HeaderParseError:
        # Malformed encoded word: keep the raw text rather than crash.
        return header

    fragments = []
    for fragment, charset in decoded_parts:
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # Charset name unknown to Python; fall back to UTF-8.
                fragment = fragment.decode('utf-8', errors='replace')
        fragments.append(fragment)
    return "".join(fragments)
def get_payload(message):
    """Return the text of every non-empty part of *message*, concatenated.

    Each part's transfer encoding is undone and its bytes are decoded with
    the charset the part declares (UTF-8 when none is declared or the name
    is unknown).  Parts that look like HTML are reduced to plain text with
    BeautifulSoup.  Runs of two or more line breaks are collapsed to two and
    the result is stripped of leading/trailing whitespace.

    Args:
        message: An email message object providing ``walk()``.

    Returns:
        The combined body text as a ``str`` (empty when nothing has content).
    """
    html_patterns = ('<html', '<table', '<div')
    chunks = []
    for part in message.walk():
        raw = part.get_payload(decode=True)
        if not raw:
            # Multipart containers and empty parts carry no content.
            continue
        text = _payload_to_text(raw, part.get_content_charset())
        # Turn HTML into plain text if necessary. If the message is sent as
        # both plain text and html, this'll result in some duplication in the
        # output, but... better too much than not enough?
        lowered = text.lower()
        if any(pattern in lowered for pattern in html_patterns):
            text = bs4.BeautifulSoup(text, "html5lib").text
        chunks.append(text)
    body = "".join(chunks)
    # Collapse some blank lines. Vertical real estate is gold.
    return re.sub(r'(\r?\n){2,}', r'\1\1', body).strip()


def _payload_to_text(raw, charset):
    """Decode payload bytes *raw* to ``str`` using *charset*, never raising."""
    if not isinstance(raw, bytes):
        return raw
    try:
        return raw.decode(charset or 'utf-8', errors='replace')
    except LookupError:
        # Charset name unknown to Python; fall back to UTF-8.
        return raw.decode('utf-8', errors='replace')
if __name__ == "__main__":
    # Convert only when executed as a script, so importing this module has
    # no side effects, and report a missing argument as a usage error
    # instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} MBOX_PATH".format(sys.argv[0]))
    run(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment