Skip to content

Instantly share code, notes, and snippets.

@kellerza
Last active May 11, 2021 16:20
Show Gist options
  • Save kellerza/98f10d65fdcd3584b67cc3fe9eaf9049 to your computer and use it in GitHub Desktop.
Save kellerza/98f10d65fdcd3584b67cc3fe9eaf9049 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# pylint: disable=invalid-name
"""mbox-extract-attachments3 - Extract attachments from mbox files.
Good companion for Google Takeout https://takeout.google.com/settings/takeout
Modified by http://github.com/kellerza from
https://github.com/PabloCastellano/pablog-scripts/
Changes in this version:
- Ported to Python 3 and fixed linter errors
- New output file naming
- MD5-based duplicate detection
- New header decoding function
Copyright (C) 2012 Pablo Castellano <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Notes (RFC 1341):
The use of a Content-Type of multipart in a body part within another multipart
entity is explicitly allowed. In such cases, for obvious reasons, care must be
taken to ensure that each nested multipart entity must use a different boundary
delimiter. See Appendix C for an example of nested multipart entities.
The use of the multipart Content-Type with only a single body part may be
useful in certain contexts, and is explicitly permitted.
The only mandatory parameter for the multipart Content-Type is the boundary
parameter, which consists of 1 to 70 characters from a set of characters known
to be very robust through email gateways, and NOT ending with white space. (If
a boundary appears to end with white space, the white space must be presumed to
have been added by a gateway, and should be deleted.) It is formally specified
by the BNF given in RFC 1341 (not reproduced here).
Related RFCs: 2047, 2044, 1522
"""
import mailbox
import base64
import os
import sys
import email
import hashlib
# Attachment file names that are skipped instead of being written to disk.
BLACKLIST = (
    'signature.asc',
    'message-footer.txt',
    'smime.p7s',
)
# Verbosity level; the script compares it against the thresholds 1 and 2.
VERBOSE = 3
# Running totals: attachments written to disk / attachments skipped.
CNT_ATT, CNT_SKP = 0, 0
# Maps each written file name to the MD5 digest of the content stored in it.
MD5S = dict()
def decode_header_str(header):
    """Decode an RFC 2047 encoded header value into a plain string.

    Args:
        header: Raw header text, possibly containing ``=?charset?enc?data?=``
            encoded words.

    Returns:
        The decoded text as ``str``. Bytes that cannot be decoded with the
        declared charset are replaced rather than raising.
    """
    # Imported locally so the function does not rely on ``email.header``
    # having been loaded as a side effect of another import.
    from email.header import decode_header

    parts = []
    for value, charset in decode_header(header):
        # decode_header() yields ``str`` when the header holds no valid
        # encoded word and ``bytes`` otherwise; only bytes need decoding.
        if isinstance(value, bytes):
            try:
                value = value.decode(charset or 'utf-8', errors='replace')
            except LookupError:
                # The message declared a charset Python does not know.
                value = value.decode('utf-8', errors='replace')
        parts.append(value)
    return ''.join(parts)
# pylint: disable=global-statement
def extract_attachment(payload):
    """Write the attachment(s) carried by a message part to the current dir.

    Multipart parts are walked recursively. A leaf part is treated as an
    attachment only when it declares a file name. Blacklisted names and
    content already written under the same name during this run are skipped
    and counted in ``CNT_SKP``; every file written is counted in ``CNT_ATT``
    and its MD5 digest is recorded in ``MD5S``.

    Args:
        payload: An ``email.message.Message`` (or subclass) instance.

    Raises:
        SystemExit: With status 2 when the output file cannot be written.
    """
    global CNT_ATT, CNT_SKP

    if payload.is_multipart():
        for part in payload.get_payload():
            extract_attachment(part)
        return

    filename = payload.get_filename()
    if filename is None:
        return

    print("Attachment found!")
    if '=?' in filename:
        filename = decode_header_str(filename)

    # The name comes from the (untrusted) message: drop any directory
    # component so a crafted name cannot write outside the output directory.
    filename = os.path.basename(filename.replace('\\', '/'))
    if filename in ('', '.', '..'):
        filename = 'attachment'

    if filename in BLACKLIST:
        CNT_SKP += 1
        if VERBOSE >= 1:
            print("Skipping {} (blacklist)\n".format(filename))
        return

    # Let the email package strip the part headers and undo the
    # Content-Transfer-Encoding (base64, quoted-printable, ...); the header
    # value is matched case-insensitively there.
    content = payload.get_payload(decode=True)
    if content is None:
        content = b''

    print("Extracting {} ({} bytes)\n".format(filename, len(content)))

    digest = hashlib.md5(content).digest()
    stem, ext = os.path.splitext(filename)
    target = filename
    num = 1
    while os.path.exists(target):
        if MD5S.get(target) == digest:
            CNT_SKP += 1
            return  # identical content already written under this name
        # ``ext`` keeps its leading dot, so names without an extension do
        # not end up with a trailing '.'.
        target = '{}.{}{}'.format(stem, num, ext)
        num += 1

    try:
        with open(target, "wb") as fptr:
            fptr.write(content)
    except IOError:
        print("Aborted, IOError!!!")
        sys.exit(2)

    MD5S[target] = digest
    CNT_ATT += 1
def main():
    """Command-line entry point: extract all attachments from an mbox file.

    Usage: ``<script> <mbox_file> [directory]``. Attachments are written to
    ``directory`` (the current directory when omitted) by changing into it
    and calling ``extract_attachment`` on every message.

    Raises:
        SystemExit: Status 0 after printing usage on a wrong argument count,
            status 1 when the mbox file or target directory does not exist.
    """
    print("Extract attachments from mbox files")
    print("Copyright (C) 2012 Pablo Castellano")
    print("This program comes with ABSOLUTELY NO WARRANTY.")
    print("This is free software, and you are welcome to redistribute it "
          "under certain conditions.")
    print()

    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: {} <mbox_file> [directory]".format(sys.argv[0]))
        sys.exit(0)

    filename = sys.argv[1]
    directory = os.path.curdir
    if not os.path.exists(filename):
        print("File doesn't exist:", filename)
        sys.exit(1)
    if len(sys.argv) == 3:
        directory = sys.argv[2]
        if not os.path.isdir(directory):
            print("Directory doesn't exist:", directory)
            sys.exit(1)

    # create=False: never create an empty mailbox as a side effect.
    mbox = mailbox.mbox(filename, create=False)
    try:
        # The mailbox file is already open, so changing directory here does
        # not affect a relative mbox path.
        os.chdir(directory)
        for key in mbox.iterkeys():
            if VERBOSE >= 2:
                print("Analyzing message number", key)
            mes = mbox.get_message(key)
            subject = _header_text(mes, 'Subject')
            em_from = _header_text(mes, 'From')
            if VERBOSE >= 2:
                print("{} - From: {}".format(subject, em_from))
            extract_attachment(mes)
    finally:
        mbox.close()

    print("\n--------------")
    print("Total attachments extracted:", CNT_ATT)
    print("Total attachments CNT_SKP:", CNT_SKP)


def _header_text(message, name):
    """Return header ``name`` of ``message`` as decoded text ('' if absent)."""
    value = message.get(name)
    if value is None:
        return ''
    # The value may be a Header object rather than str; normalise it first.
    value = str(value)
    if '=?' in value:
        value = decode_header_str(value)
    return value
# Run the command-line tool only when executed as a script, so importing this
# module does not parse sys.argv or call sys.exit().
if __name__ == "__main__":
    main()
@ericmclachlan
Copy link

Nice dude. Thanks for this. 👍

@mark-andrews
Copy link

Brilliant! Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment