Last active
August 29, 2015 14:11
-
-
Save pedramamini/03e3a42d521a6a7edf36 to your computer and use it in GitHub Desktop.
Break an mbox file out into multiple files by year. Written to chunk my GMail Takeout into manageable pieces.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
What
    Break an mbox file out into multiple files by year.
    Written to chunk my GMail Takeout into manageable pieces.
    Will prefix YEAR- to mbox name and *append* to those file names.
Who
    Pedram Amini
    http://pedramamini.com
    https://gist.github.com/pedramamini/03e3a42d521a6a7edf36
Usage
    split_mbox_by_year <input_mbox>
Example
    $ ./split_mbox_by_year.py GMail.mbox
    [**] processing GMail.mbox...
    [**] setting output spool 2014-GMail.mbox.
    [**] closing output spool 2014-GMail.mbox with 3248 messages.
    [**] setting output spool 2013-GMail.mbox.
    [**] closing output spool 2013-GMail.mbox with 7 messages.
    [**] setting output spool 2014-GMail.mbox.
    [**] closing output spool 2014-GMail.mbox with 3306 messages.
    [**] setting output spool 2013-GMail.mbox.
    [**] closing output spool 2013-GMail.mbox with 8 messages.
    [**] setting output spool 2014-GMail.mbox.
    [**] closing output spool 2014-GMail.mbox with 3593 messages.
    [**] setting output spool 2013-GMail.mbox.
    [**] closing output spool 2013-GMail.mbox with 10 messages.
    [**] setting output spool 2014-GMail.mbox.
    [**] closing output spool 2014-GMail.mbox with 6075 messages.
    [**] setting output spool 2013-GMail.mbox.
    [**] closing output spool 2013-GMail.mbox with 12 messages.
    ...
    [**] closing output spool 2013-GMail.mbox with 23181 messages.
    [**] setting output spool 2012-GMail.mbox.
    [**] closing output spool 2012-GMail.mbox with 23689 messages.
    [**] setting output spool 2007-GMail.mbox.
    [**] closing output spool 2007-GMail.mbox with 3 messages.
    [**] setting output spool 2012-GMail.mbox.
    [**] completed processing 73151 messages into 4 spools in 213 seconds.
"""
import time | |
import sys | |
import os | |
import re | |
def USAGE():
    """
    Print a one-line usage summary to stderr and terminate the process.

    The script path shown in the message is taken from ``__file__``.
    Always exits with status 1 via ``sys.exit``, so this never returns.
    """
    sys.stderr.write("Usage: %s <input_mbox>\n" % __file__)
    sys.exit(1)
def dbg(msg, newline=True, wrap=False):
    """
    Write a status message to stdout, prefixed with "[**] ".

    msg     -- text to print.
    newline -- when true (and wrap is false) a "\\n" is appended.
    wrap    -- when true a "\\r" is appended instead of a newline, so the
               next message overwrites this one on a terminal (used for
               progress counters). Takes precedence over newline.
    """
    if wrap:
        msg += "\r"
    elif newline:
        msg += "\n"

    sys.stdout.write("[**] %s" % msg)

    # Output that does not end its line is not pushed out by a line-buffered
    # stdout; flush so progress / partial lines become visible immediately.
    if wrap or not newline:
        sys.stdout.flush()
# An mbox message separator: a line starting with "From " whose last field is
# a four-digit year. Compiled once (as a bytes pattern, since the mbox is read
# in binary mode) instead of being re-parsed for every input line.
_MSG_START = re.compile(br"^From .* (\d{4})$")


def _split_mbox(input_mbox_path):
    """
    Append every message of the mbox at input_mbox_path to a per-year spool.

    Spool files are created next to the input file and are named
    "<YEAR>-<input file name>"; they are opened in append mode, so re-running
    adds to existing spools. The year is the trailing four digits of each
    message's "From " separator line.

    Returns a tuple (msg_count, spool_counts) where spool_counts maps the year
    string to the number of messages written for that year.
    """
    input_mbox_dir = os.path.dirname(input_mbox_path)
    input_mbox_base = os.path.basename(input_mbox_path)

    spool_year = None
    spool_path = None
    spool_handle = None
    spool_counts = {}
    msg_count = 0
    orphan_lines = 0

    try:
        # Binary mode: mail bodies are arbitrary bytes, so decoding them as
        # text could fail or alter the data being copied.
        with open(input_mbox_path, "rb") as mbox:
            for line in mbox:
                msg_start = _MSG_START.match(line.strip())

                if msg_start:
                    msg_count += 1
                    dbg("processing message #%d" % msg_count, wrap=True)

                    year = msg_start.group(1)
                    # On Python 3 the match is bytes and must become text for
                    # building the path; on Python 2 bytes already is str.
                    if not isinstance(year, str):
                        year = year.decode("ascii")

                    if year != spool_year:
                        if spool_handle:
                            spool_handle.close()
                            spool_handle = None
                            dbg("closing output spool %s with %d messages." % (spool_path, spool_counts[spool_year]))

                        spool_year = year
                        spool_path = os.path.join(input_mbox_dir, "%s-%s" % (spool_year, input_mbox_base))
                        spool_handle = open(spool_path, "ab")
                        dbg("setting output spool %s." % spool_path)

                    spool_counts[spool_year] = spool_counts.get(spool_year, 0) + 1

                if spool_handle is None:
                    # Nothing to attach this line to yet: no separator with a
                    # recognizable year has been seen.
                    orphan_lines += 1
                    continue

                spool_handle.write(line)
    finally:
        # Always release the last spool, including when an error interrupts
        # the copy.
        if spool_handle:
            spool_handle.close()
            dbg("closing output spool %s with %d messages." % (spool_path, spool_counts[spool_year]))

    if orphan_lines:
        sys.stderr.write("[!!] skipped %d line(s) preceding the first message separator\n" % orphan_lines)

    return msg_count, spool_counts


if __name__ == "__main__":
    if len(sys.argv) != 2:
        USAGE()

    input_mbox_path = sys.argv[1]

    if not os.path.exists(input_mbox_path):
        sys.stderr.write("[!!] mbox not found: %s\n" % input_mbox_path)
        USAGE()

    # Cheap sanity check: an mbox file begins with a "From " separator line.
    with open(input_mbox_path, "rb") as fh:
        looks_like_mbox = fh.read(4) == b"From"

    if not looks_like_mbox:
        sys.stderr.write("[!!] %s does not look like a valid mbox file\n" % input_mbox_path)
        USAGE()

    dbg("processing %s..." % input_mbox_path)

    start_time = int(time.time())
    msg_count, spool_counts = _split_mbox(input_mbox_path)
    delta = int(time.time()) - start_time

    dbg("completed processing %d messages into %d spools in %d seconds." % (msg_count, len(spool_counts), delta))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment