-
-
Save beamzer/8a1e9629c203eaa9eb8d2fb4725b053a to your computer and use it in GitHub Desktop.
Extract attachments from EML files in the current dir, and write them to the output subdir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.7 | |
""" | |
source: https://gist.github.com/urschrei/5258588 by Stephan Hügel | |
2020 update: | |
- More iterators, fewer lists | |
- Python 3 compatible | |
- Processes files in parallel | |
(one worker process per CPU core, via multiprocessing.Pool) | |
Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a ) | |
2020-09-20: | |
- handling of same filenames (write everything because contents might be different) | |
- handling of filenames with * | |
- handling of mkdir errors | |
- added arguments | |
2020-09-23 (v1.2) | |
- version_nr before extension | |
- error handling for utf-8 chars in eml (on error continue) | |
2020-10-02 (v1.3) | |
- now correctly handles RFC2047 MIME encoded filenames | |
2020-10-06 (v1.4) | |
- now handles multi-line filenames | |
- fixed handling of emails with no attachments | |
""" | |
import glob | |
import os | |
import email | |
import argparse | |
from multiprocessing import Pool | |
from cs.rfc2047 import unrfc2047 | |
# File extension (without the dot) of the mail files that get processed.
EXTENSION = "eml"

# Command-line interface: three independent on/off switches.
parser = argparse.ArgumentParser(description='extract attachments from eml files')
parser.add_argument('-d', '--debug', action='store_true',
                    help='print debug messages to stderr')
parser.add_argument('-s', '--single', action='store_true',
                    help='run as single thread (default = multithreaded, one thread per core)')
parser.add_argument('-q', '--quiet', action='store_true',
                    help='no output')
args = parser.parse_args()

# Module-level flags read by the rest of the script.
debug, single, quiet = args.debug, args.single, args.quiet

# NOTE(review): the --debug help text mentions stderr, but print() writes to
# stdout by default.
if debug:
    print("debug output is active")

# Directory that receives every extracted attachment.
od = "attachments"

# Create the output directory once, up front, instead of inside the
# per-file extract function. exist_ok=True keeps this quiet if the
# directory appears between the check and the creation.
if not os.path.exists(od):
    os.makedirs(od, exist_ok=True)
def _sanitize_attachment_name(raw_name):
    """Turn the file name declared by a MIME part into a safe local name.

    Returns the cleaned name, or an empty string when nothing usable is left.
    """
    # Folded (multi-line) header values still carry their line breaks;
    # tabs are replaced by underscores.
    name = raw_name.replace("\r", "").replace("\n", "").replace("\t", "_")

    # RFC 2047 encoded-words (often used for obfuscation) are decoded here.
    # Decoding is best effort: on failure the undecoded name is used rather
    # than a stale name from a previous attachment.
    try:
        decoded = unrfc2047(name)
    except Exception as err:
        if not quiet:
            print("could not decode attachment name %r: %s" % (name, err))
        decoded = name
    if debug and decoded != name:
        print("decoded attachment name: %s" % decoded)

    # The name comes from the mail itself and is untrusted: keep only the
    # last path component so it cannot escape the output directory.
    decoded = os.path.basename(decoded.replace("\\", "/"))
    # '*' is replaced after decoding so encoded asterisks are caught too.
    decoded = decoded.replace("*", "#")
    if decoded in ("", ".", ".."):
        return ""
    return decoded


def _write_unique(directory, name, data):
    """Write data to directory/name without overwriting; return the name used.

    If the name is taken, "_1", "_2", ... is inserted before the extension.
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 0
    while True:
        try:
            # Mode "x" creates the file exclusively, so two worker processes
            # can never pick the same free name and overwrite each other.
            out = open(os.path.join(directory, candidate), "xb")
        except FileExistsError:
            counter += 1
            candidate = "%s_%d%s" % (stem, counter, ext)
            continue
        with out:
            out.write(data)
        return candidate


def extract(filename):
    """Extract every named attachment of one mail file into the output dir.

    The mail is read in binary mode so that messages which are not valid
    UTF-8 can still be parsed. All leaf MIME parts are visited; parts without
    a file name (e.g. the message body) or without a decodable payload are
    skipped. Attachments with an already existing name are written under a
    numbered name, because their contents might differ.

    Args:
        filename: path of the mail file to process.

    Returns:
        A tuple (1, output_count): one processed file and the number of
        attachments written for it. Errors are reported on stdout and do not
        propagate, so one bad mail cannot abort a whole run.
    """
    if debug:
        print("=> reading %s" % filename)
    output_count = 0
    try:
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f)

        for part in msg.walk():
            # Containers hold no data themselves; their children are
            # visited by walk() anyway.
            if part.is_multipart():
                continue
            raw_name = part.get_filename()
            if not raw_name:
                continue
            if debug:
                print("attachment name: %s" % raw_name)

            output_filename = _sanitize_attachment_name(raw_name)
            payload = part.get_payload(decode=True)
            if not output_filename or payload is None:
                continue

            written_name = _write_unique(od, output_filename, payload)
            if not quiet:
                print("Writing %s " % written_name)
            output_count += 1

        if output_count == 0 and not quiet:
            print("No attachment found for file %s!" % filename)
    # read and write errors
    except OSError:
        if not quiet:
            print("Problem with %s or one of its attachments!" % filename)
    # anything else (e.g. a malformed message): report and carry on
    except Exception as err:
        print('Fail: %s\n' % filename)
        if debug:
            print(repr(err))
    return 1, output_count
if __name__ == "__main__":
    # Collect the work list once so both modes process the same files.
    eml_files = glob.glob("*.%s" % EXTENSION)

    if not single:
        if debug:
            print("running multithreaded")
        # Pool(None) starts one worker process per CPU.
        pool = Pool(None)
        try:
            res = pool.map(extract, eml_files)
        finally:
            # close() + join() lets the workers exit normally, which also
            # flushes whatever they printed.
            pool.close()
            pool.join()
    else:
        if debug:
            print("running single threaded")
        res = [extract(name) for name in eml_files]

    # Each result is (files, attachments). Summing the columns directly
    # also works for an empty result list, where unpacking zip(*res) into
    # the format string used to fail.
    filecnt = sum(files for files, _ in res)
    cnt = sum(attachments for _, attachments in res)
    if not quiet:
        print("Done: Processed {} files with {} attachments.".format(filecnt, cnt))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @beamzer thanks for this wonderful code
Hi @julianjamespy ,
I had the same problem with .eml embedded in .eml.
For some shady reason, the parser is unable to adequately process .eml attachments.
But at least you can open it as a string or bytes, and you will notice that there is a kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""
So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]
Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml
Please find below the reviewed function (working on my side):