kellerza · January 13, 2025 23:16 · ericmclachlan · Jul 25, 2019 · mark-andrews · May 11, 2021
diff --git a/mbox-extract-attachments3.py b/mbox-extract-attachments3.py
 #!/usr/bin/env python3
 # pylint: disable=invalid-name
 """mbox-extract-attachments3 - Extract attachments from mbox files.

 Good companion for Google Takeout https://takeout.google.com/settings/takeout

 Modified by http://github.com/kellerza from
    https://github.com/PabloCastellano/pablog-scripts/
 - Python3 & linter errors
 - New Filenames
 - MD5 duplicate detection
 - New header function

 Copyright (C) 2012 Pablo Castellano <[email protected]>

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.


 Notes (RFC 1341):
 The use of a Content-Type of multipart in a body part within another multipart
 entity is explicitly allowed. In such cases, for obvious reasons, care must be
 taken to ensure that each nested multipart entity must use a different boundary
 delimiter. See Appendix C for an example of nested multipart entities.
 The use of the multipart Content-Type with only a single body part may be
 useful in certain contexts, and is explicitly permitted.
 The only mandatory parameter for the multipart Content-Type is the boundary
 parameter, which consists of 1 to 70 characters from a set of characters known
 to be very robust through email gateways, and NOT ending with white space. (If
 a boundary appears to end with white space, the white space must be presumed to
 have been added by a gateway, and should be deleted.) It is formally specified
 by a BNF grammar given in that RFC (not reproduced here).

 Related RFCs: 2047, 2044, 1522
 """

 import mailbox
 import base64
 import os
 import sys
 import email
 import hashlib

 # Attachment filenames that extract_attachment() skips instead of saving.
 BLACKLIST = ('signature.asc', 'message-footer.txt', 'smime.p7s')
 # Console verbosity: >= 1 reports blacklisted skips, >= 2 also reports each
 # message as it is analyzed.
 VERBOSE = 3

 CNT_ATT = 0  # Number of attachments written to disk so far
 CNT_SKP = 0  # Number of attachments skipped (blacklisted or duplicate)
 MD5S = {}  # Output filename -> MD5 digest of the content saved under it


 def decode_header_str(header):
    """Decode an RFC 2047 encoded header value into a plain string.

    Args:
        header: Raw header value, possibly containing
            ``=?charset?encoding?text?=`` encoded words.

    Returns:
        The decoded parts concatenated into one ``str``. Bytes that cannot
        be decoded in the declared charset are replaced rather than raising.
    """
    pieces = []
    for text, charset in email.header.decode_header(header):
        if isinstance(text, str):
            # decode_header() yields str (not bytes) when the value holds no
            # encoded words, so there is nothing left to decode.
            pieces.append(text)
            continue
        try:
            pieces.append(text.decode(charset or 'utf-8', errors='replace'))
        except LookupError:
            # Unknown or misspelled charset name: fall back to UTF-8.
            pieces.append(text.decode('utf-8', errors='replace'))
    return ''.join(pieces)


 # pylint: disable=global-statement
 def extract_attachment(payload):
    """Save the attachment carried by ``payload``, recursing into multiparts.

    A part counts as an attachment when it declares a filename. The file is
    written into the current working directory. If a file of that name
    already exists, a numeric suffix is inserted before the extension,
    unless the same content was already saved under that name during this
    run, in which case the part is skipped.

    Args:
        payload: An ``email.message.Message`` (or a sub-part of one).

    Side effects:
        Updates the module counters ``CNT_ATT`` / ``CNT_SKP`` and the
        ``MD5S`` digest map, prints progress, and terminates the process
        with exit status 2 if the file cannot be written.
    """
    global CNT_ATT, CNT_SKP

    if payload.is_multipart():
        for part in payload.get_payload():
            extract_attachment(part)
        return

    filename = payload.get_filename()
    if filename is None:
        return

    print("Attachment found!")
    if '=?' in filename:
        filename = decode_header_str(filename)

    # The name comes from an untrusted email: keep only the last path
    # component so '../x' or '/abs/x' cannot escape the output directory.
    filename = os.path.basename(filename.replace('\\', '/'))
    if filename in ('', '.', '..'):
        filename = 'attachment'

    if filename in BLACKLIST:
        CNT_SKP += 1
        if VERBOSE >= 1:
            print("Skipping {} (blacklist)\n".format(filename))
        return

    # Let the email package undo the Content-Transfer-Encoding (base64,
    # quoted-printable, ...) instead of slicing the raw bytes by hand.
    content = payload.get_payload(decode=True)
    if content is None:
        content = b''

    print("Extracting {} ({} bytes)\n".format(filename, len(content)))

    hdigest = hashlib.md5(content).digest()

    stem, ext = os.path.splitext(filename)
    num = 1
    while os.path.exists(filename):
        if MD5S.get(filename) == hdigest:
            CNT_SKP += 1
            return  # identical content already saved under this name
        # 'report.pdf' -> 'report.1.pdf'; 'README' -> 'README.1'
        filename = '{}.{}{}'.format(stem, num, ext)
        num += 1

    try:
        # 'with' closes the file even on a failed write, and never touches
        # an unbound handle when open() itself fails.
        with open(filename, "wb") as fptr:
            fptr.write(content)
    except OSError:
        print("Aborted, IOError!!!")
        sys.exit(2)

    # Record the digest only once the file is really on disk.
    MD5S[filename] = hdigest
    CNT_ATT += 1


 def main():
    """Command-line entry point: extract every attachment from an mbox file.

    Usage: ``<script> <mbox_file> [directory]``. Attachments are written to
    ``directory`` (default: the current directory), which must already
    exist. Exits with status 0 after printing usage on a wrong argument
    count, and with status 1 if the mbox file or the directory is missing.
    """
    print("Extract attachments from mbox files")
    print("Copyright (C) 2012 Pablo Castellano")
    print("This program comes with ABSOLUTELY NO WARRANTY.")
    print("This is free software, and you are welcome to redistribute it "
          "under certain conditions.")
    print()

    if len(sys.argv) not in (2, 3):
        print("Usage: {} <mbox_file> [directory]".format(sys.argv[0]))
        sys.exit(0)

    filename = sys.argv[1]
    directory = os.path.curdir

    if not os.path.exists(filename):
        print("File doesn't exist:", filename)
        sys.exit(1)

    if len(sys.argv) == 3:
        directory = sys.argv[2]
        # isdir() is already False for a path that does not exist.
        if not os.path.isdir(directory):
            print("Directory doesn't exist:", directory)
            sys.exit(1)

    mbox = mailbox.mbox(filename)

    # extract_attachment() writes relative to the working directory.
    os.chdir(directory)

    try:
        for key in mbox.iterkeys():
            if VERBOSE >= 2:
                print("Analyzing message number", key)

            mes = mbox.get_message(key)

            # A header may be absent (None) or come back as a Header object
            # rather than str, so normalise to str before searching it.
            subject = str(mes.get('Subject', ''))
            if '=?' in subject:
                subject = decode_header_str(subject)

            em_from = str(mes.get('From', ''))
            if '=?' in em_from:
                em_from = decode_header_str(em_from)

            if VERBOSE >= 2:
                print("{} - From: {}".format(subject, em_from))

            extract_attachment(mes)
    finally:
        mbox.close()

    print("\n--------------")
    print("Total attachments extracted:", CNT_ATT)
    print("Total attachments skipped:", CNT_SKP)

 # Run only when executed as a script, so importing this module (for reuse
 # or testing) does not start an extraction.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# pylint: disable=invalid-name
	"""mbox-extract-attachments3 - Extract attachments from mbox files.

	Good companion for Google Takeout https://takeout.google.com/settings/takeout

	Modified by http://github.com/kellerza from
	https://github.com/PabloCastellano/pablog-scripts/
	- Python3 & linter errors
	- New Filenames
	- MD5 duplicate detection
	- New header function

	Copyright (C) 2012 Pablo Castellano <[email protected]>

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.


	Notes (RFC 1341):
	The use of a Content-Type of multipart in a body part within another multipart
	entity is explicitly allowed. In such cases, for obvious reasons, care must be
	taken to ensure that each nested multipart entity must use a different boundary
	delimiter. See Appendix C for an example of nested multipart entities.
	The use of the multipart Content-Type with only a single body part may be
	useful in certain contexts, and is explicitly permitted.
	The only mandatory parameter for the multipart Content-Type is the boundary
	parameter, which consists of 1 to 70 characters from a set of characters known
	to be very robust through email gateways, and NOT ending with white space. (If
	a boundary appears to end with white space, the white space must be presumed to
	have been added by a gateway, and should be deleted.) It is formally specified
	by a BNF grammar given in that RFC (not reproduced here).

	Related RFCs: 2047, 2044, 1522
	"""

	import mailbox
	import base64
	import os
	import sys
	import email
	import hashlib

	# Attachment filenames that extract_attachment() skips instead of saving.
	BLACKLIST = ('signature.asc', 'message-footer.txt', 'smime.p7s')
	# Console verbosity: >= 1 reports blacklisted skips, >= 2 also reports each
	# message as it is analyzed.
	VERBOSE = 3

	CNT_ATT = 0 # Number of attachments written to disk so far
	CNT_SKP = 0  # Number of attachments skipped (blacklisted or duplicate)
	MD5S = {}  # Output filename -> MD5 digest of the content saved under it


	def decode_header_str(header):
	    """Decode an RFC 2047 encoded header value into a plain string.

	    Args:
	        header: Raw header value, possibly containing
	            ``=?charset?encoding?text?=`` encoded words.

	    Returns:
	        The decoded parts concatenated into one ``str``. Bytes that cannot
	        be decoded in the declared charset are replaced rather than raising.
	    """
	    pieces = []
	    for text, charset in email.header.decode_header(header):
	        if isinstance(text, str):
	            # decode_header() yields str (not bytes) when the value holds no
	            # encoded words, so there is nothing left to decode.
	            pieces.append(text)
	            continue
	        try:
	            pieces.append(text.decode(charset or 'utf-8', errors='replace'))
	        except LookupError:
	            # Unknown or misspelled charset name: fall back to UTF-8.
	            pieces.append(text.decode('utf-8', errors='replace'))
	    return ''.join(pieces)


	# pylint: disable=global-statement
	def extract_attachment(payload):
	    """Save the attachment carried by ``payload``, recursing into multiparts.

	    A part counts as an attachment when it declares a filename. The file is
	    written into the current working directory. If a file of that name
	    already exists, a numeric suffix is inserted before the extension,
	    unless the same content was already saved under that name during this
	    run, in which case the part is skipped.

	    Args:
	        payload: An ``email.message.Message`` (or a sub-part of one).

	    Side effects:
	        Updates the module counters ``CNT_ATT`` / ``CNT_SKP`` and the
	        ``MD5S`` digest map, prints progress, and terminates the process
	        with exit status 2 if the file cannot be written.
	    """
	    global CNT_ATT, CNT_SKP

	    if payload.is_multipart():
	        for part in payload.get_payload():
	            extract_attachment(part)
	        return

	    filename = payload.get_filename()
	    if filename is None:
	        return

	    print("Attachment found!")
	    if '=?' in filename:
	        filename = decode_header_str(filename)

	    # The name comes from an untrusted email: keep only the last path
	    # component so '../x' or '/abs/x' cannot escape the output directory.
	    filename = os.path.basename(filename.replace('\\', '/'))
	    if filename in ('', '.', '..'):
	        filename = 'attachment'

	    if filename in BLACKLIST:
	        CNT_SKP += 1
	        if VERBOSE >= 1:
	            print("Skipping {} (blacklist)\n".format(filename))
	        return

	    # Let the email package undo the Content-Transfer-Encoding (base64,
	    # quoted-printable, ...) instead of slicing the raw bytes by hand.
	    content = payload.get_payload(decode=True)
	    if content is None:
	        content = b''

	    print("Extracting {} ({} bytes)\n".format(filename, len(content)))

	    hdigest = hashlib.md5(content).digest()

	    stem, ext = os.path.splitext(filename)
	    num = 1
	    while os.path.exists(filename):
	        if MD5S.get(filename) == hdigest:
	            CNT_SKP += 1
	            return  # identical content already saved under this name
	        # 'report.pdf' -> 'report.1.pdf'; 'README' -> 'README.1'
	        filename = '{}.{}{}'.format(stem, num, ext)
	        num += 1

	    try:
	        # 'with' closes the file even on a failed write, and never touches
	        # an unbound handle when open() itself fails.
	        with open(filename, "wb") as fptr:
	            fptr.write(content)
	    except OSError:
	        print("Aborted, IOError!!!")
	        sys.exit(2)

	    # Record the digest only once the file is really on disk.
	    MD5S[filename] = hdigest
	    CNT_ATT += 1


	def main():
	    """Command-line entry point: extract every attachment from an mbox file.

	    Usage: ``<script> <mbox_file> [directory]``. Attachments are written to
	    ``directory`` (default: the current directory), which must already
	    exist. Exits with status 0 after printing usage on a wrong argument
	    count, and with status 1 if the mbox file or the directory is missing.
	    """
	    print("Extract attachments from mbox files")
	    print("Copyright (C) 2012 Pablo Castellano")
	    print("This program comes with ABSOLUTELY NO WARRANTY.")
	    print("This is free software, and you are welcome to redistribute it "
	          "under certain conditions.")
	    print()

	    if len(sys.argv) not in (2, 3):
	        print("Usage: {} <mbox_file> [directory]".format(sys.argv[0]))
	        sys.exit(0)

	    filename = sys.argv[1]
	    directory = os.path.curdir

	    if not os.path.exists(filename):
	        print("File doesn't exist:", filename)
	        sys.exit(1)

	    if len(sys.argv) == 3:
	        directory = sys.argv[2]
	        # isdir() is already False for a path that does not exist.
	        if not os.path.isdir(directory):
	            print("Directory doesn't exist:", directory)
	            sys.exit(1)

	    mbox = mailbox.mbox(filename)

	    # extract_attachment() writes relative to the working directory.
	    os.chdir(directory)

	    try:
	        for key in mbox.iterkeys():
	            if VERBOSE >= 2:
	                print("Analyzing message number", key)

	            mes = mbox.get_message(key)

	            # A header may be absent (None) or come back as a Header object
	            # rather than str, so normalise to str before searching it.
	            subject = str(mes.get('Subject', ''))
	            if '=?' in subject:
	                subject = decode_header_str(subject)

	            em_from = str(mes.get('From', ''))
	            if '=?' in em_from:
	                em_from = decode_header_str(em_from)

	            if VERBOSE >= 2:
	                print("{} - From: {}".format(subject, em_from))

	            extract_attachment(mes)
	    finally:
	        mbox.close()

	    print("\n--------------")
	    print("Total attachments extracted:", CNT_ATT)
	    print("Total attachments skipped:", CNT_SKP)

	# Run only when executed as a script, so importing this module (for reuse
	# or testing) does not start an extraction.
	if __name__ == "__main__":
	    main()
No results found