#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Modified.
# Original script source:
# http://blog.marcbelmont.com/2012/10/script-to-extract-email-attachments.html
# https://web.archive.org/web/20150312172727/http://blog.marcbelmont.com/2012/10/script-to-extract-email-attachments.html

# Usage:
# Run the script from a folder with file "all.mbox"
# Attachments will be extracted into subfolder "attachments"
# with prefix "m " where m is a message ID in mbox file.
# Or
# ./extract_mbox_attachments.py -i first.mbox -o attachments1/
# ./extract_mbox_attachments.py -i second.mbox -o attachments2/
# ./extract_mbox_attachments.py --help

# ---------------
# Please check the unpacked files
# with an antivirus before opening them!
# ---------------

# I make no representations or warranties of any kind concerning
# the software, express, implied, statutory or otherwise,
# including without limitation warranties of title, merchantability,
# fitness for a particular purpose, non infringement, or the
# absence of latent or other defects, accuracy, or the present or
# absence of errors, whether or not discoverable, all to the
# greatest extent permissible under applicable law.

import errno
import mailbox
import mimetypes
import os
import pathlib  # since Python 3.4
import re
import traceback
from email.header import decode_header
import argparse
import sys


def parse_options(args=[]):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', default='all.mbox', help='Input file')
    parser.add_argument('-o', '--output', default='attachments/', help='Output folder')
    parser.add_argument('--no-inline-images', action='store_true')
    parser.add_argument('--start',
                        type=message_id_type, default=0,
                        help='On which message to start')
    parser.add_argument('--stop',
                        type=message_id_type, default=100000000000,
                        help='On which message to stop, not included')
    return parser.parse_args(args)


def message_id_type(arg):
    try:
        i = int(arg)
    except ValueError as e:
        raise argparse.ArgumentTypeError(str(e))
    if i < 0:
        raise argparse.ArgumentTypeError("Must be greater than or equal 0.")
    return i


class Extractor:
    def __init__(self, options):
        self.__total = 0
        self.__failed = 0
        self.options = options

        assert os.path.isfile(options.input)
        self.mbox = mailbox.mbox(options.input)

        if not os.path.exists(options.output):
            os.makedirs(options.output)

        self.inline_image_folder = os.path.join(options.output, 'inline_images/')

        if (not options.no_inline_images) and (not os.path.exists(self.inline_image_folder)):
            os.makedirs(self.inline_image_folder)

    def increment_total(self):
        self.__total += 1

    def increment_failed(self):
        self.__failed += 1

    def get_total(self):
        return self.__total

    def get_failed(self):
        return self.__failed


def to_file_path(save_to, name):
    return os.path.join(save_to, name)


def get_extension(name):
    extension = pathlib.Path(name).suffix
    return extension if len(extension) <= 20 else ''


def resolve_name_conflicts(save_to, name, file_paths, attachment_number):
    file_path = to_file_path(save_to, name)

    START = 1
    iteration_number = START

    while os.path.normcase(file_path) in file_paths:
        extension = get_extension(name)
        iteration = '' if iteration_number <= START else ' (%s)' % iteration_number
        new_name = '%s attachment %s%s%s' % (name, attachment_number, iteration, extension)
        file_path = to_file_path(save_to, new_name)
        iteration_number += 1

    file_paths.append(os.path.normcase(file_path))
    return file_path


# Whitespaces: tab, carriage return, newline, vertical tab, form feed.
FORBIDDEN_WHITESPACE_IN_FILENAMES = re.compile('[\t\r\n\v\f]+')
OTHER_FORBIDDEN_FN_CHARACTERS = re.compile('[/\\\\\\?%\\*:\\|"<>\0]')


def filter_fn_characters(s):
    result = re.sub(FORBIDDEN_WHITESPACE_IN_FILENAMES, ' ', s)
    result = re.sub(OTHER_FORBIDDEN_FN_CHARACTERS, '_', result)
    return result


def decode_filename(part, fallback_filename, mid):
    if part.get_filename() is None:
        print('Filename is none: %s %s.' % (mid, fallback_filename))
        return fallback_filename
    else:
        decoded_name = decode_header(part.get_filename())

        if isinstance(decoded_name[0][0], str):
            return decoded_name[0][0]
        else:
            try:
                name_encoding = decoded_name[0][1]
                return decoded_name[0][0].decode(name_encoding)
            except:
                print('Could not decode %s %s attachment name.' % (mid, fallback_filename))
                return fallback_filename


def write_to_disk(part, file_path):
    with open(file_path, 'wb') as f:
        f.write(part.get_payload(decode=True))


def save(extractor, mid, part, attachments_counter, inline_image=False):
    extractor.increment_total()
    try:
        if inline_image:
            attachments_counter['inline_image'] += 1
            attachment_number_string = 'ii' + str(attachments_counter['inline_image'])
            destination_folder = extractor.inline_image_folder
        else:
            attachments_counter['value'] += 1
            attachment_number_string = str(attachments_counter['value'])
            destination_folder = extractor.options.output

        filename = decode_filename(
            part,
            attachment_number_string + str(mimetypes.guess_extension(part.get_content_type()) or ''),
            mid)

        filename = filter_fn_characters(filename)
        filename = '%s %s' % (mid, filename)

        previous_file_paths = attachments_counter['file_paths']

        try:
            write_to_disk(part, resolve_name_conflicts(
                destination_folder, filename,
                previous_file_paths,
                attachment_number_string))
        except OSError as e:
            if e.errno == errno.ENAMETOOLONG:
                short_name = '%s %s%s' % (mid, attachment_number_string, get_extension(filename))
                write_to_disk(part, resolve_name_conflicts(
                    destination_folder, short_name,
                    previous_file_paths,
                    attachment_number_string))
            else:
                raise
    except:
        traceback.print_exc()
        extractor.increment_failed()


def check_part(extractor, mid, part, attachments_counter):
    mime_type = part.get_content_type()
    if part.is_multipart():
        for p in part.get_payload():
            check_part(extractor, mid, p, attachments_counter)
    elif (part.get_content_disposition() == 'attachment') \
            or ((part.get_content_disposition() != 'inline') and (part.get_filename() is not None)):
        save(extractor, mid, part, attachments_counter)
    elif (mime_type.startswith('application/') and not mime_type == 'application/javascript') \
            or mime_type.startswith('model/') \
            or mime_type.startswith('audio/') \
            or mime_type.startswith('video/'):
        message_id_content_type = 'Message id = %s, Content-type = %s.' % (mid, mime_type)
        if part.get_content_disposition() == 'inline':
            print('Extracting inline part... ' + message_id_content_type)
        else:
            print('Other Content-disposition... ' + message_id_content_type)
        save(extractor, mid, part, attachments_counter)
    elif (not extractor.options.no_inline_images) and mime_type.startswith('image/'):
        save(extractor, mid, part, attachments_counter, True)


def process_message(extractor, mid):
    msg = extractor.mbox.get_message(mid)

    if msg.is_multipart():
        attachments_counter = {
            'value': 0,
            'inline_image': 0,
            'file_paths': []
        }
        for part in msg.get_payload():
            check_part(extractor, mid, part, attachments_counter)


def extract_mbox_file(options):
    extractor = Extractor(options)

    print()
    for i in range(options.start, options.stop):
        try:
            process_message(extractor, i)
        except KeyError:
            print('The whole mbox file was processed.')
            break
        if i % 1000 == 0:
            print('Messages processed: {}'.format(i))

    print()
    print('Total files: %s' % extractor.get_total())
    print('Failed: %s' % extractor.get_failed())


if __name__ == "__main__":
    extract_mbox_file(parse_options(sys.argv[1:]))
Thanks @georgy7
This fork attempts to modify the date of the saved file to match the date of the message:
https://gist.github.com/janko-js/d61cd1cd41c094905c9f5fb6fb55fca0
It was harder than I thought as the formatting of the date can vary.
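For anyone curious, the core of that idea is small. Below is a rough sketch (not the fork's actual code; the helper name is made up) that sets a saved attachment's timestamps from the message's Date header using only the standard library:

```python
# Hypothetical helper, not part of this gist or of the fork linked above.
import os
from email.utils import parsedate_to_datetime

def apply_message_date(file_path, message):
    """Set the file's atime/mtime to the message's Date header, if parsable."""
    date_header = message.get('Date')
    if not date_header:
        return
    try:
        timestamp = parsedate_to_datetime(date_header).timestamp()
    except (TypeError, ValueError):
        return  # leave the file time unchanged when the header is unparsable
    os.utime(file_path, (timestamp, timestamp))
```

As the discussion below shows, the hard part is that real Date headers don't always parse this cleanly.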
Thank you all for your feedback. Unfortunately, I don't have enough free time, and it's unlikely that I'll be able to figure out the intricacies of various email formats in the coming weeks. When I started writing this script, I had no idea that they could be so different.
I didn't expect you to do anything; it worked for my case, and I posted the change here for anybody who might find the modification useful. It's most probably not a perfect solution, but for use cases simpler than or comparable to mine, it could be enough.
Maybe @georgy7 or someone else (if he doesn't have the free time) can put it into a regular repository and we can get some group development/collaboration going.
I used it recently and made a modification for my use case. I don't think I will need any changes soon, but if I do, I'll check here for the most recent version.
Then, there's also:
https://devblogs.microsoft.com/oldnewthing/20110118-00/?p=11733
@janko-js I meant in general. For instance, what is described in @ChrisCheney's message.
Also, I had plans to make another script based on this, indexing an mbox file into an SQLite database. It will be difficult if such a simple thing as the date of the message can be recorded in several ways.
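A tiny sketch of what that indexing could look like, purely hypothetical and not part of this gist (it assumes only the stdlib mailbox, sqlite3, and email.utils modules; the table layout is made up):

```python
import mailbox
import sqlite3
from email.utils import parsedate_to_datetime

def index_mbox(mbox_path, db_path):
    """Record one row per message with its subject and best-effort date."""
    db = sqlite3.connect(db_path)
    db.execute('CREATE TABLE IF NOT EXISTS messages '
               '(mid INTEGER, subject TEXT, date TEXT)')
    for mid, msg in enumerate(mailbox.mbox(mbox_path)):
        try:
            date = parsedate_to_datetime(msg.get('Date', '')).isoformat()
        except (TypeError, ValueError):
            date = None  # exactly the Date-format problem discussed here
        db.execute('INSERT INTO messages VALUES (?, ?, ?)',
                   (mid, msg.get('Subject', ''), date))
    db.commit()
    db.close()
```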
> It will be difficult if such a simple thing as the date of the message can be recorded in several ways.
It seems it is so. My solution was very "ad hoc". But apparently there is a recommended way. The RFC that covers the Date format is
https://datatracker.ietf.org/doc/html/rfc2822#section-3.3
My way of coping with some e-mails I wanted to process was, eventually, if nothing else worked, to call "dateutil.parser.parse" initialized with the time zones I had seen. That RFC seems to recommend ignoring "obsolete" tz names other than EDT EST CDT CST MDT MST PDT PST, but my cases were none of these; they were a few others. So now, if I were to attempt a general solution, I'd try to correctly initialize all the zones from https://www.timeanddate.com/time/zones/ for such cases. Note that a good initialization can probably use the same tzinfo for both the DST and non-DST variants, and that the tzinfo data usable with ".parse" is potentially "smarter" than a simple offset. The good thing is that the e-mails most people will ever have to process are never older than, say, 1990, so all the differences in time zones before that are irrelevant for most scenarios.
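Roughly, that dateutil fallback looks like the following sketch. It assumes the third-party python-dateutil package, and the tzinfos table is only illustrative, not a complete initialization of all zones:

```python
from dateutil import parser, tz

# Map the abbreviations actually seen in the mailbox to fixed offsets;
# CEST/CET are just examples, extend the dict for whatever turns up.
TZINFOS = {
    'CEST': tz.tzoffset('CEST', 2 * 3600),
    'CET': tz.tzoffset('CET', 1 * 3600),
}

def parse_with_dateutil(date_string):
    """Last-resort parser for Date headers that stricter parsers rejected."""
    return parser.parse(date_string, tzinfos=TZINFOS)

print(parse_with_dateutil('Sat Apr 11 20:22:37 CEST 2015'))
# 2015-04-11 20:22:37+02:00
```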
Also, thanks for mentioning that; it seems I hadn't used the fix by @ChrisCheney, thinking for some reason that your version was recent enough, and missed it.
> or someone else
@michealespinola, perhaps this is the best solution if you are interested in fast progress. Although I'm a little sorry to lose control of the project, the truth is that in the near future I won't even be able to delve into pull requests and make meaningful decisions on them.
Or I can create a project in my account, but give admin rights to someone who wants to maintain it.
Regarding the license: I vote for zlib/libpng.
Now that I've checked, it seems there is some code which is less ad-hoc and which hopefully covers at least exactly that what's described in RFCs:
https://docs.python.org/3/library/email.utils.html
email.utils.parsedate
which I haven't tried, but it can still be that some senders constructed their date times differently than what's handled there. I still believe the careful checks and covering the cases which exist in "real life" as found in the collected e-mails through the different years/decades are important.
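For reference, here are the stdlib helpers mentioned above in a minimal example (parsedate returns a 9-tuple; its sibling parsedate_to_datetime returns a datetime):

```python
from email.utils import parsedate, parsedate_to_datetime

header = 'Mon, 4 May 1998 03:04:12 +0000'
print(parsedate(header))              # (1998, 5, 4, 3, 4, 12, 0, 1, -1)
print(parsedate_to_datetime(header))  # 1998-05-04 03:04:12+00:00
```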
I've created the repository. I sent out invitations that, as far as I understand, give direct access to the repository to several people at once. Hopefully it won't cause too much chaos.
For now, I didn't list @dasebasto and @janko-js, at least, among the authors, because your changes are not included yet.
@marcbelmont, @ChrisCheney, @michealespinola, @nealmcb and others,
please let me know if you agree with its license. Perhaps you would prefer something else.
@georgy7 Wow, I completely forgot that once upon a time I wrote this. Thanks for making all those changes. I'm glad there's now a repository.
thanks for the invitation @georgy7. I'm assigning any possible copyright on my modifications of your script to your new project, in case you use anything from them -- I consider them only good enough for somebody who is willing to modify the code, if needed, while using it, not as any "final, just use it and don't care" solution. The reason: the modifications don't cover all the possible formats per the RFCs. I personally think whoever agreed to that specification at the time was very unreasonable(1). Additionally, as far as I understand, there are some clients which don't follow even these RFCs. I've had e-mails with date-times like
Sat Apr 11 20:22:37 CEST 2015
and if I understand correctly, that is not covered by the RFCs, yet it is a date that can be clearly interpreted. On the other hand, the RFCs allow something like
Mon (Lundi), 4(quatre)May (Mai) 1998(1998-05-04)03 : 04 : 12 +0000
which I haven't seen and hope I never will. And if I understand the RFCs correctly, it's even worse: even something similar to this is acceptable per the RFCs:
Mon (Lundi
), 4(
quatre)May (Mai) 1998(1998-05-04)03
(bla) :
(bla)04 (bla):
12 +0000
As it's potentially too much work to try to solve "everything forever", I plan to modify my version "with dates" only to adjust to the formats actually seen in my own mails, the next time I process them. My current guess is that email.utils.parsedate should be used as much as possible (as it probably handles the "comments"?), but if it doesn't cover what is sometimes actually used, it's still not enough; and when it fails, even dateutil.parser.parse would not cover the allowed comments and the possible newlines(?!) in the body of the date. So email.utils.parsedate would probably have to improve to cover whatever really exists, not only what the RFCs recommend. Which I believe won't be an accepted attitude in Python circles, which traditionally prefer prescriptions to making life easier by covering and cleaning up the real-life cases. That also fits with the RFC suggesting the time zone info be discarded even when it is, to common knowledge, unique enough to be clear which zone it is (which is the opposite of what I'd do to resolve the contents of my own inboxes, as I've done with dateutil -- and it worked there!).
So my impression at the moment is that if one wanted a "very robust" Date solution, somebody would have to write a local routine that does much of what email.utils.parsedate does, plus, just in case, filter out all the possible newlines and comments, and then even more to cover the formats actually produced but maybe not handled by parsedate if it sticks to the RFCs. Maybe I'm wrong, but I believe I don't have enough examples to verify -- only people with access to enough gigabytes of mail spanning different times and locations would have enough real-life test cases. Which to me says: without the resources of a big company, solving things "ad hoc" and accumulating improvements slowly, whenever somebody hits some new problematic input, is the "cheapest" way for a script like this.
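As an illustration only, such a local routine might be layered like the sketch below. It assumes python-dateutil is available; the whitespace unfolding and the TZINFOS table are guesses aimed at the cases described above, not a complete solution:

```python
import re
from email.utils import parsedate_to_datetime
from dateutil import parser, tz

# Illustrative abbreviations only; extend for whatever actually shows up.
TZINFOS = {'CEST': tz.tzoffset('CEST', 7200), 'CET': tz.tzoffset('CET', 3600)}

def parse_date_header(value):
    """Try the stdlib parser first, then dateutil, then give up."""
    value = re.sub(r'\s+', ' ', value).strip()  # unfold folded header lines
    try:
        return parsedate_to_datetime(value)
    except (TypeError, ValueError):
        pass
    try:
        return parser.parse(value, tzinfos=TZINFOS, fuzzy=True)
    except (ValueError, OverflowError):
        return None  # caller decides what to do with an unparsable Date
```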
I liked it as a gist; it told me that it shouldn't be considered "too serious" the way a big project would be (i.e. that I will probably need to adjust it for my own purposes). Once one makes a "project", it can potentially get "out of control" by trying to solve more than is manageable? So to sum up, I assign my modifications of that gist to you; if you use them, please use/distribute them without even mentioning me or asking me for permission. I'll do the same (gift you the changes) in case your project takes off and I make any modification of it.
- "If you're a new implementor, you'll be shocked at how badly 822 was designed. Extracting even the simplest information from a message---the author's address, for example, or the sending date---is excruciatingly painful. And I see no sign that we'll ever be rid of the horrors of 822 syntax", djb : https://cr.yp.to/immhf.html
> Maybe I'm wrong but I believe I don't have enough examples to verify
@janko-js
Recently, I found out that there are mbox files called "mime-torture", small sets of extremely diverse examples.
For instance, Mark Crispin's MIME torture test
> Although I'm a little sorry to lose control of the project
Please don't be sorry, @georgy7. What you have done here has been extremely helpful for many of us, and I for one am very grateful to you. Projects running away with, or just becoming too intrusive on, our personal lives is a real thing. It's nothing to be sorry about. It's precisely why most GitHub projects have multiple collaborators.
> Recently, I found out that there are mbox files called "mime-torture", small sets of extremely diverse examples.
> For instance, Mark Crispin's MIME torture test
@georgy7 thanks for that. My guess is that the Python libraries on which your project depends would fail most of the real "torture" tests -- I've tried to process that one with your script and it exited before seeing a single message. Thunderbird, unless I made some error trying, also wasn't able to see any. There, the dates appear relatively comparable to what I've seen in my "normal" messages, but that specific "torture" file doesn't try to cover what the RFCs clearly consider acceptable (comments even inside the time part, newlines!). Seeing that, I suspect the tests actually used, even for the bigger projects, also aren't exhaustive and are more of a "has anybody complained" check.
(I knew about https://en.wikipedia.org/wiki/Qmail and djb, but only after I saw the details of how "illogical" the RFCs are did I understand why it was clearly hard to make a secure e-mail infrastructure.)
Just works! Thanks a lot.
@georgy7 thank you for this, extremely useful
@georgy7 Thank you! This is just what I needed.
Thank you, amazing: it extracted 150-ish Excel files in seconds.
My own attempt in Swift failed and I ended up with 800k files, lol. Thank you again.
@georgy7 Thank you! It extracted 30,000+ files in just 5 minutes.
Thank you so much for this, it worked perfectly! 1k files took mere seconds. I tried so many other methods and was striking out; again, many thanks.
Amazing work @georgy7, the script works perfectly.