-
-
Save georgy7/3a80bce2cd8bf2f9985c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Modified.
# Original script source:
# http://blog.marcbelmont.com/2012/10/script-to-extract-email-attachments.html
# https://web.archive.org/web/20150312172727/http://blog.marcbelmont.com/2012/10/script-to-extract-email-attachments.html
# Usage:
# Run the script from a folder with file "all.mbox"
# Attachments will be extracted into subfolder "attachments"
# with prefix "m " where m is a message ID in mbox file.
# Or
# ./extract_mbox_attachments.py -i first.mbox -o attachments1/
# ./extract_mbox_attachments.py -i second.mbox -o attachments2/
# ./extract_mbox_attachments.py --help
# ---------------
# Please check the unpacked files
# with an antivirus before opening them!
# ---------------
# I make no representations or warranties of any kind concerning
# the software, express, implied, statutory or otherwise,
# including without limitation warranties of title, merchantability,
# fitness for a particular purpose, non infringement, or the
# absence of latent or other defects, accuracy, or the present or
# absence of errors, whether or not discoverable, all to the
# greatest extent permissible under applicable law.
import errno | |
import mailbox | |
import mimetypes | |
import os | |
import pathlib # since Python 3.4 | |
import re | |
import traceback | |
from email.header import decode_header | |
import argparse | |
import sys | |
def parse_options(args=None):
    """Parse command-line arguments for the extractor.

    :param args: list of argument strings, e.g. ``sys.argv[1:]``;
        ``None`` means "no arguments", which yields all defaults.
    :return: argparse.Namespace with ``input``, ``output``,
        ``no_inline_images``, ``start`` and ``stop`` attributes.
    """
    # None is used as the sentinel instead of a mutable default ([]),
    # which would be shared between calls.
    if args is None:
        args = []
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', default='all.mbox', help='Input file')
    parser.add_argument('-o', '--output', default='attachments/', help='Output folder')
    parser.add_argument('--no-inline-images', action='store_true')
    parser.add_argument('--start',
                        type=message_id_type, default=0,
                        help='On which message to start')
    parser.add_argument('--stop',
                        type=message_id_type, default=100000000000,
                        help='On which message to stop, not included')
    return parser.parse_args(args)
def message_id_type(arg):
    """argparse ``type=`` converter: a non-negative integer message id.

    :raises argparse.ArgumentTypeError: if ``arg`` is not an integer
        or is negative.
    """
    try:
        value = int(arg)
    except ValueError as error:
        raise argparse.ArgumentTypeError(str(error))
    if value >= 0:
        return value
    raise argparse.ArgumentTypeError("Must be greater than or equal 0.")
class Extractor:
    """Holds the open mbox, the output folders and success/failure counters."""

    def __init__(self, options):
        """
        :param options: argparse.Namespace with ``input`` (mbox path),
            ``output`` (destination folder) and ``no_inline_images``.
        :raises FileNotFoundError: if ``options.input`` is not a file.
        """
        self.__total = 0
        self.__failed = 0
        self.options = options
        # Explicit exception instead of `assert`: assertions are stripped
        # when Python runs with -O, and input validation must not be.
        if not os.path.isfile(options.input):
            raise FileNotFoundError(options.input)
        self.mbox = mailbox.mbox(options.input)
        # exist_ok=True replaces the check-then-create pattern, which is
        # racy if the folder appears between the check and makedirs.
        os.makedirs(options.output, exist_ok=True)
        self.inline_image_folder = os.path.join(options.output, 'inline_images/')
        if not options.no_inline_images:
            os.makedirs(self.inline_image_folder, exist_ok=True)

    def increment_total(self):
        self.__total += 1

    def increment_failed(self):
        self.__failed += 1

    def get_total(self):
        return self.__total

    def get_failed(self):
        return self.__failed
def to_file_path(save_to, name):
    """Join the destination folder with the file name."""
    return os.path.join(save_to, name)
def get_extension(name):
    """Return the file suffix of ``name`` (including the dot).

    Suffixes longer than 20 characters are treated as noise (not a
    real extension) and yield the empty string.
    """
    suffix = pathlib.Path(name).suffix
    if len(suffix) > 20:
        return ''
    return suffix
def resolve_name_conflicts(save_to, name, file_paths, attachment_number):
    """Pick a destination path under ``save_to`` that does not collide
    with any path already recorded in ``file_paths`` (compared
    case-insensitively via os.path.normcase).

    The chosen path is appended to ``file_paths`` so later calls for
    the same message avoid it; the normalized form is what is stored.
    """
    START = 1
    attempt = START
    candidate = to_file_path(save_to, name)
    while os.path.normcase(candidate) in file_paths:
        suffix = get_extension(name)
        marker = ' (%s)' % attempt if attempt > START else ''
        renamed = '%s attachment %s%s%s' % (name, attachment_number, marker, suffix)
        candidate = to_file_path(save_to, renamed)
        attempt += 1
    file_paths.append(os.path.normcase(candidate))
    return candidate
# Whitespaces: tab, carriage return, newline, vertical tab, form feed.
FORBIDDEN_WHITESPACE_IN_FILENAMES = re.compile('[\t\r\n\v\f]+')
OTHER_FORBIDDEN_FN_CHARACTERS = re.compile('[/\\\\\\?%\\*:\\|"<>\0]')


def filter_fn_characters(s):
    """Make ``s`` safe to use as a file name: collapse each run of
    control whitespace into a single space and replace every reserved
    character with an underscore."""
    without_whitespace = FORBIDDEN_WHITESPACE_IN_FILENAMES.sub(' ', s)
    return OTHER_FORBIDDEN_FN_CHARACTERS.sub('_', without_whitespace)
def decode_filename(part, fallback_filename, mid):
    """Return the attachment file name of a message part.

    Decodes RFC 2047 encoded-word names via ``decode_header``; falls
    back to ``fallback_filename`` when the part carries no name or the
    name cannot be decoded.

    :param part: email.message.Message part.
    :param fallback_filename: name to use when decoding is impossible.
    :param mid: message id, used only in diagnostic output.
    """
    if part.get_filename() is None:
        print('Filename is none: %s %s.' % (mid, fallback_filename))
        return fallback_filename
    else:
        decoded_name = decode_header(part.get_filename())
        if isinstance(decoded_name[0][0], str):
            return decoded_name[0][0]
        else:
            try:
                name_encoding = decoded_name[0][1]
                return decoded_name[0][0].decode(name_encoding)
            # `except Exception` instead of a bare `except:` so Ctrl-C
            # (KeyboardInterrupt) is not swallowed.  Exception still
            # covers TypeError (encoding is None), LookupError (unknown
            # charset) and UnicodeDecodeError (bad bytes).
            except Exception:
                print('Could not decode %s %s attachment name.' % (mid, fallback_filename))
                return fallback_filename
def write_to_disk(part, file_path):
    """Decode the part's payload (base64/quoted-printable/etc.) and
    write the raw bytes to ``file_path``."""
    payload = part.get_payload(decode=True)
    with open(file_path, 'wb') as output:
        output.write(payload)
def save(extractor, mid, part, attachments_counter, inline_image=False):
    """Extract one attachment part to disk, counting successes/failures.

    :param extractor: Extractor with counters and output folders.
    :param mid: message id, used as a file-name prefix.
    :param part: the message part whose decoded payload is written out.
    :param attachments_counter: per-message dict with keys 'value',
        'inline_image' and 'file_paths'.
    :param inline_image: route the file into the inline-image subfolder
        and number it with an 'ii' prefix.
    """
    extractor.increment_total()
    try:
        if inline_image:
            attachments_counter['inline_image'] += 1
            attachment_number_string = 'ii' + str(attachments_counter['inline_image'])
            destination_folder = extractor.inline_image_folder
        else:
            attachments_counter['value'] += 1
            attachment_number_string = str(attachments_counter['value'])
            destination_folder = extractor.options.output
        # Fallback name is "<number><guessed extension>" when the part
        # carries no usable file name of its own.
        filename = decode_filename(
            part,
            attachment_number_string + str(mimetypes.guess_extension(part.get_content_type()) or ''),
            mid)
        filename = filter_fn_characters(filename)
        filename = '%s %s' % (mid, filename)
        previous_file_paths = attachments_counter['file_paths']
        try:
            write_to_disk(part, resolve_name_conflicts(
                destination_folder, filename,
                previous_file_paths,
                attachment_number_string))
        except OSError as e:
            # Retry with a short synthetic name when the decoded name
            # exceeds the file-system's limit.
            if e.errno == errno.ENAMETOOLONG:
                short_name = '%s %s%s' % (mid, attachment_number_string, get_extension(filename))
                write_to_disk(part, resolve_name_conflicts(
                    destination_folder, short_name,
                    previous_file_paths,
                    attachment_number_string))
            else:
                raise
    # `except Exception` instead of a bare `except:` so Ctrl-C
    # (KeyboardInterrupt) still stops the run; any extraction error is
    # reported, counted as failed, and processing continues.
    except Exception:
        traceback.print_exc()
        extractor.increment_failed()
def check_part(extractor, mid, part, attachments_counter):
    """Recursively inspect a message part and save anything
    attachment-like: explicit attachments, named non-inline parts,
    binary content types, and (optionally) inline images."""
    mime_type = part.get_content_type()
    disposition = part.get_content_disposition()
    if part.is_multipart():
        for child in part.get_payload():
            check_part(extractor, mid, child, attachments_counter)
        return
    named_non_inline = (disposition != 'inline') and (part.get_filename() is not None)
    if disposition == 'attachment' or named_non_inline:
        save(extractor, mid, part, attachments_counter)
        return
    # Content types that are almost certainly files worth extracting,
    # even without an attachment disposition (javascript excluded).
    looks_binary = (
        mime_type.startswith(('model/', 'audio/', 'video/'))
        or (mime_type.startswith('application/')
            and mime_type != 'application/javascript'))
    if looks_binary:
        message_id_content_type = 'Message id = %s, Content-type = %s.' % (mid, mime_type)
        if disposition == 'inline':
            print('Extracting inline part... ' + message_id_content_type)
        else:
            print('Other Content-disposition... ' + message_id_content_type)
        save(extractor, mid, part, attachments_counter)
        return
    if mime_type.startswith('image/') and not extractor.options.no_inline_images:
        save(extractor, mid, part, attachments_counter, True)
def process_message(extractor, mid):
    """Walk every top-level part of message ``mid`` looking for attachments.

    Non-multipart messages have no attachments and are skipped.
    """
    message = extractor.mbox.get_message(mid)
    if not message.is_multipart():
        return
    # Counters are per message: attachment numbering restarts for each
    # message, and name-conflict tracking is local to it.
    attachments_counter = {
        'value': 0,
        'inline_image': 0,
        'file_paths': []
    }
    for part in message.get_payload():
        check_part(extractor, mid, part, attachments_counter)
def extract_mbox_file(options):
    """Extract attachments from the mbox file described by ``options``.

    Iterates message ids from ``options.start`` up to (not including)
    ``options.stop``; a KeyError from the mailbox marks the end of the
    file.  Prints totals when done.
    """
    extractor = Extractor(options)
    print()
    for message_id in range(options.start, options.stop):
        try:
            process_message(extractor, message_id)
        except KeyError:
            # get_message raised: no message with this id, so we ran
            # past the end of the mbox.
            print('The whole mbox file was processed.')
            break
        if message_id % 1000 == 0:
            print('Messages processed: {}'.format(message_id))
    print()
    print('Total files: %s' % extractor.get_total())
    print('Failed: %s' % extractor.get_failed())
if __name__ == "__main__": | |
extract_mbox_file(parse_options(sys.argv[1:])) |
Maybe I'm wrong but I believe I don't have enough examples to verify
@janko-js
Recently, I found out that there are mbox files called "mime-torture", small sets of extremely diverse examples.
For instance, Mark Crispin's MIME torture test
Although I'm a little sorry to lose control of the project
Please don't be sorry, @georgy7. What you have done here has been extremely helpful for many of us, and I for one am very grateful to you. Projects running away or just becoming too intrusive on our personal lives is a real thing. It's nothing to be sorry about. It's precisely why most GitHub projects have multiple collaborators.
Recently, I found out that there are mbox files called "mime-torture", small sets of extremely diverse examples.
For instance, Mark Crispin's MIME torture test
@georgy7 thanks for that. My guess is that the python libraries on which your project depends would fail most of the real "torture" tests -- I've tried to process that one with your script and it exited before seeing a single message? Thunderbird, unless I've made some error trying, also wasn't able to see any. There, the dates appear relatively comparable to what I've seen in my "normal" messages, but that specific "torture" doesn't try to cover what the RFCs clearly consider acceptable (comments even inside of the time part, new lines!) . Seeing that, I suspect the tests actually used, even for the bigger projects, also aren't exhaustive, and more "has anybody complained".
(I knew about https://en.wikipedia.org/wiki/Qmail and djb, but only after I've seen the details of how "illogical" the RFCs are I've understood why it was clearly hard to make a secure e-mail infrastructure.)
Just work! Thanks a lot
@georgy7 thank you for this, extremely useful
@georgy7 Thank you! This is just what I needed.
thank you , amazing, extracted 150ish excel files in seconds.
My own attempt in Swift failed and I ended up with 800k files. lol. Thank you again
@georgy7 Thank You! It took out 30000+ files in just 5 mins
Thank you so much for this, worked perfect! 1k files took mere seconds. I tried so many other methods and was striking out, again many thanks.
thanks for the invitation @georgy7 . I'm assigning any possible copyright of my modifications of your script to your new project, in case you use anything of these modifications -- I consider them only as good enough for somebody who is willing to modify the code, if needed, while using it, not as any "final just use and don't care" solution in any way. The reason: The modifications aren't covering all the possible formats per RFCs: I personally think whoever agreed with that specification at that time was very unreasonable(1) Additionally as far as I understand there are some clients which are not following even these RFCs. I've had e-mails with the date times like
and if I understand that is not covered per RFCs, but a date that can be clearly interpreted. On another side RFCs allow something like
Mon (Lundi), 4(quatre)May (Mai) 1998(1998-05-04)03 : 04 : 12 +0000
that I haven't seen and I hope I will not. And if I understand the RFCs it's even worse, like even something similar to this is acceptable per RFCs:
Mon (Lundi
), 4(
quatre)May (Mai) 1998(1998-05-04)03
(bla) :
(bla)04 (bla):
12 +0000
As it's potentially to much work to try to solve "everything forever", I plan to modify my version "with dates" only to adjust to the formats actually seen in my own mails, the next time I process them. My current guess is that email.utils.parsedate should be used as much as possible (as it probably solves the "comments"?), but if it's not covering what is sometimes actually used it's still not enough, and that when it fails even dateutil.parser.parse would not cover the allowed comments and the possible new lines(?!) in the body of the date, so probably email.utils.parsedate would have to improve to cover whatever really exists and not only what is recommended in RFCs. Which I believe won't be an accepted attitude in Python circles which traditionally prefer prescriptions to making life easier by covering and cleaning up the real life cases, which also fits with RFC suggesting discarding the info about the time zone even if it is to the common knowledge unique enough to be clear which it is (which is the opposite of what I'd do to resolve the content of my own inboxes, like I've done with using dateutil (and it worked there!) ).
So my impression at the moment is that if one would like a "very robust" Date solution I guess somebody would have to make a local routine that does much of the email.utils.parsedate and, just in case (to filter out all the possible new lines and comments), and then even more to cover the formats actually produced but maybe not handled by parsedate if that one sticks to RFCs. Maybe I'm wrong but I believe I don't have enough examples to verify -- as in, only people with an access to enough gigabytes of mails spanning from different times and locations would have the enough real life test cases. Which to me says: without the resources like in some big company, solving "ad hoc" and accumulating improvements slowly, at the times somebody has some new problematic input is the "cheapest" way for a script like this.
I liked it as a gist, it told me that it shouldn't be considered "too serious" as a big project would (i.e that I will probably need to adjust it for my own purposes). Once one makes a "project" then it can potentially come "out of control" with trying to solve more than is manageable? So to sum, I assign my modifications to that gist to you, if you use them, please use/distribute them without even mentioning me or asking me for a permission. I'll do the same (gift you the changes) in case your project takes off and I make any modification of it.