Last active
July 3, 2024 04:17
-
-
Save drinkcat/292257bcbc9d36751d688c06304225b9 to your computer and use it in GitHub Desktop.
Split gmail takeout MBOX file per year
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split an mbox file into one output file per year.
import os
import sys

# Byte sequence that separates two messages: a blank line (CRLF CRLF)
# followed by the "From " envelope line that starts the next message.
MATCH = b'\r\n\r\nFrom '
# Number of separator bytes that still belong to the previous message
# (the two CRLF pairs above); the next message begins right after them.
OFFSET = len(b'\r\n\r\n')
# Number of bytes read from the input file per chunk.
BUFSIZE = 1024*1024
# Open output file objects, keyed by output file name (the year string).
outfiles = {}
def parse_message(buffer, start, end):
    """Append the message stored in ``buffer[start:end]`` to its year file.

    The first line of the message must be an mbox envelope line of the
    form ``From <sender> <date ...>``.  The last whitespace-separated
    token of that line is used as the output file name (for the usual
    envelope date this is the year).  The output file is created inside
    the module-level ``output`` directory on first use and cached in the
    module-level ``outfiles`` dict, so it stays open for later messages.

    Args:
        buffer: bytes object holding at least one complete message.
        start: index of the first byte of the message in ``buffer``.
        end: index one past the last byte of the message in ``buffer``.

    Raises:
        Exception: if no CRLF-terminated first line exists inside the
            message, if that line is not a ``From`` envelope line, or if
            its last token is not usable as a plain file name.
    """
    print(f'Message {start}-{end}')
    # Only look for the end of the envelope line inside this message.
    headend = buffer.find(b'\r\n', start, end)
    if headend < start:
        raise Exception('Can\'t find header.')
    # The sender part may contain arbitrary bytes; escape undecodable ones
    # instead of aborting the run (the raw bytes are written unchanged).
    header = buffer[start:headend].decode('utf-8', errors='backslashreplace')
    # TODO: Would be more proper to split at most twice and parse the
    # date properly.
    # split() without a separator ignores repeated and trailing spaces, so
    # the last token is never empty.
    headerdata = header.split()
    if len(headerdata) < 3 or headerdata[0] != "From":
        raise Exception(f'Bad header /{header}/.')
    print(f"{headerdata[1]} -- {headerdata[-1]}")
    # Just use the year
    outfile = headerdata[-1]
    # The token becomes a file name: refuse anything that would address a
    # path outside (or the root of) the output directory.
    if os.path.basename(outfile) != outfile or outfile in ('.', '..'):
        raise Exception(f'Bad header /{header}/.')
    if outfile not in outfiles:
        outfiles[outfile] = open(os.path.join(output, outfile), "wb")
    outfiles[outfile].write(buffer[start:end])
# Command line: split.py <input mbox file> <output directory>
if len(sys.argv) != 3:
    print("Usage: python split.py input.mbox output")
    # Non-zero status so callers can detect the usage error.
    sys.exit(1)

filename = sys.argv[1]
# Read by parse_message() as the directory the year files are created in.
output = sys.argv[2]
os.makedirs(output, exist_ok=True)

# Bytes read so far that do not yet form a complete message.
lastbuf = b''

try:
    # Parse input
    with open(filename, 'rb') as infile:
        while True:
            buf = infile.read(BUFSIZE)
            if not buf:
                break
            # The carried-over tail was already searched and holds no
            # complete separator; only a separator straddling the chunk
            # boundary can start inside it, so resume the search
            # len(MATCH) - 1 bytes before the new data.
            search_from = max(0, len(lastbuf) - (len(MATCH) - 1))
            buf = lastbuf + buf
            i = 0
            while True:
                # Find next
                nexti = buf.find(MATCH, search_from)
                if nexti < 0:
                    break
                # The blank line still belongs to the current message.
                nexti += OFFSET
                # Parse
                parse_message(buf, i, nexti)
                i = nexti
                search_from = i
            # Keep the incomplete trailing message for the next chunk.
            lastbuf = buf[i:]

    # Last message
    if lastbuf:
        parse_message(lastbuf, 0, len(lastbuf))
finally:
    # Close all FD, even when a message could not be parsed.
    for handle in outfiles.values():
        handle.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment