Created
November 19, 2016 08:17
-
-
Save eliask/3dddfe40a3133d4913b829b36b191419 to your computer and use it in GitHub Desktop.
Playing with a Gmail mbox dump from Google Takeout (writes every attachment and message part as a separate file under /tmp).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# Usage: mbox_stuff.py All\ mail\ Including\ Spam\ and\ Trash.mbox | |
import mailbox | |
import traceback | |
import sys | |
from itertools import chain | |
num = 1  # running counter that gives every dumped part a unique file name


def get_parts(msg, n=0, out_dir='/tmp'):
    """Dump every leaf part of *msg* to disk and yield its text parts.

    Each non-multipart part is written, transfer-decoded, to
    ``<out_dir>/msg<NNNNNN>.<subtype>`` where ``NNNNNN`` comes from the
    module-level counter ``num`` and ``subtype`` is the second half of the
    part's content type (``plain``, ``html``, ``pdf`` ...).

    Args:
        msg: an ``email.message.Message`` (or subclass such as the messages
            produced by ``mailbox.mbox``).
        n: unused; kept only so existing callers that pass a depth still work.
        out_dir: directory the part files are written into.

    Yields:
        ``(payload, charset)`` for every ``text/*`` part, where ``payload``
        is the decoded ``bytes`` body and ``charset`` is the charset declared
        in the part's Content-Type header (``None`` when absent).
    """
    global num
    # Message.walk() already visits the message and all nested sub-parts
    # depth-first (yielding the message itself first), so a single flat pass
    # is enough; recursing over walk() re-visits the same parts repeatedly.
    for part in msg.walk():
        if part.is_multipart():
            # Containers carry no payload of their own.
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing to write or decode for a part without a body.
            continue
        type_ = part.get_content_type()
        ext = type_.split('/')[-1]
        num += 1
        with open('%s/msg%06d.%s' % (out_dir, num, ext), 'wb') as fh:
            fh.write(payload)
        if type_.startswith('text/'):
            # get_charset() is only populated by set_charset(); the charset
            # declared in the headers is exposed by get_content_charset().
            yield payload, part.get_content_charset()
def get_body_parts(msg):
    """Yield each text part of *msg* decoded to ``str``, once per part.

    For every ``(payload, charset)`` pair produced by ``get_parts`` the
    payload is decoded with the first candidate charset that works, tried in
    this order: the part's own charset, every charset named anywhere in the
    message (``msg.get_charsets()``), then UTF-8. If none of them succeeds
    the payload is decoded as UTF-8 with replacement characters so the part
    is still reported rather than silently dropped.
    """
    # Ordered, de-duplicated below; get_charsets() has a None entry for every
    # part that declares no charset.
    all_charsets = [x for x in msg.get_charsets() if x]
    for raw, declared in get_parts(msg):
        if raw is None:
            continue
        candidates = []
        for name in chain([declared] if declared else [], all_charsets, ['utf-8']):
            # str() also copes with email.charset.Charset objects.
            name = str(name)
            if name not in candidates:
                candidates.append(name)
        for charset in candidates:
            try:
                text = raw.decode(charset)
            except (LookupError, ValueError):
                # Unknown codec name, or bytes invalid in this charset:
                # move on to the next candidate.
                continue
            yield text
            # Stop at the first success so a part is not emitted once per
            # charset that happens to decode it.
            break
        else:
            yield raw.decode('utf-8', errors='replace')
# Script entry: print every message of the mbox given on the command line
# (a separator, the Subject/From headers, then each decoded text part).
if len(sys.argv) < 2:
    sys.exit('Usage: mbox_stuff.py <file.mbox>')
mboxfile = sys.argv[1]
for i, mail in enumerate(mailbox.mbox(mboxfile)):
    print('=================', i, '=====================')
    try:
        # Both fields are positional: a named '{from}' field raises KeyError
        # because no keyword argument is passed, so the header never printed.
        print('''Subject: {}
From: {}
'''.format(mail['subject'], mail['From']))
    except Exception:
        # Best effort: report the problem but keep going with the body.
        traceback.print_exc(file=sys.stdout)
    for body in get_body_parts(mail):
        print(body)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment