Skip to content

Instantly share code, notes, and snippets.

@beamzer
Forked from urschrei/parseml.py
Last active October 30, 2024 14:35
Show Gist options
  • Save beamzer/8a1e9629c203eaa9eb8d2fb4725b053a to your computer and use it in GitHub Desktop.
Save beamzer/8a1e9629c203eaa9eb8d2fb4725b053a to your computer and use it in GitHub Desktop.
Extract attachments from EML files in the current dir, and write them to the output subdir
#!/usr/bin/env python3.7
"""
source: https://gist.github.com/urschrei/5258588 by Stephan Hügel
2020 update:
- More iterators, fewer lists
- Python 3 compatible
- Processes files in parallel
(one worker process per CPU core, via multiprocessing.Pool)
Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a )
2020-09-20:
- handling of same filenames (write everything because contents might be different)
- handling of filenames with *
- handling of mkdir errors
- added arguments
2020-09-23 (v1.2)
- version_nr before extension
- error handling for utf-8 chars in eml (on error continue)
2020-10-02 (v1.3)
- now correctly handles RFC2047 MIME encoded filenames
2020-10-06 (v1.4)
- now handles multi-line filenames
- fixed handling of emails with no attachments
"""
import argparse
import email
import glob
import os
import re
from multiprocessing import Pool

from cs.rfc2047 import unrfc2047
EXTENSION = "eml"
parser = argparse.ArgumentParser(description='extract attachments from eml files')
# Every option is a plain on/off switch, so register them from one table:
# (short flag, long flag, help text).
for short_flag, long_flag, help_text in (
    ('-d', '--debug', 'print debug messages to stderr'),
    ('-s', '--single', 'run as single thread (default = multithreaded, one thread per core)'),
    ('-q', '--quiet', 'no output'),
):
    parser.add_argument(short_flag, long_flag, action='store_true', help=help_text)

args = parser.parse_args()
# Module-level switches read by extract() and by the __main__ driver.
debug, single, quiet = args.debug, args.single, args.quiet
if debug:
    print("debug output is active")

# Directory that receives every extracted attachment.
od = "attachments"
# Created once here instead of inside extract(), which runs once per mail.
# exist_ok=True keeps a concurrent creation of the same directory from raising.
if not os.path.exists(od):
    os.makedirs(od, exist_ok=True)
# Whitespace between two adjacent RFC 2047 encoded-words. RFC 2047 section 6.2
# says it is not part of the text, so it is removed before decoding.
_ENCODED_WORD_GAP = re.compile(r"\?=\s+=\?")


def _clean_filename(raw_name):
    """
    Turn the filename announced by a MIME part into a safe local file name.

    Returns an empty string when nothing usable is left.
    """
    # A long encoded name is folded over several header lines; without this the
    # folding whitespace ends up inside the decoded name ("report.cs v").
    name = _ENCODED_WORD_GAP.sub("?==?", raw_name)
    # Any remaining line break is header folding, never part of the name.
    name = name.replace("\r", "").replace("\n", "")
    # RFC2047 MIME encoded filenames (often used for obfuscation)
    try:
        decoded = unrfc2047(name)
    except Exception as inst:
        # Best effort: keep the undecoded name rather than losing the attachment.
        print("could not decode attachment name %r: %r" % (name, inst))
        decoded = name
    if debug and decoded != name:
        print("decoded attachment name: %s" % decoded)
    # Replace characters that are troublesome in file names.
    name = decoded.replace("\t", "_").replace("*", "#")
    # The name comes from the mail itself: drop every directory component so
    # that nothing can be written outside of the output directory.
    name = os.path.basename(name.replace("\\", "/"))
    return "" if name in (".", "..") else name


def _part_bytes(part):
    """
    Return the content of a MIME part as bytes, or None if there is none.
    """
    if part.is_multipart():
        # Reached for message/* parts (multipart/* containers are skipped by
        # the caller): the payload is a list of embedded message objects.
        return b"".join(inner.as_bytes() for inner in part.get_payload())
    return part.get_payload(decode=True)


def _open_unique(directory, name):
    """
    Create a new file in directory without overwriting an existing one.

    If name is taken, "_1", "_2", ... is inserted before the extension.
    Returns the tuple (name actually used, file object opened for binary write).
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    attempt = 0
    while True:
        try:
            # Mode "x" fails if the file exists, so two worker processes can
            # never both claim the same name.
            return candidate, open(os.path.join(directory, candidate), "xb")
        except FileExistsError:
            attempt += 1
            candidate = stem + "_" + str(attempt) + ext


def extract(filename):
    """
    Try to extract the attachments from filename.

    Every part of the message that carries a filename is written to the output
    directory `od`. Parts nested inside other multipart containers or inside
    an attached message are included. Existing files are never overwritten.

    filename: path of the EML file to read.

    Returns the tuple (1, number of attachments written), so that the results
    of several calls can be summed column by column.
    Errors are reported on stdout and never propagate to the caller.
    """
    if debug:
        print("=> reading %s" % filename)
    output_count = 0
    try:
        # Read bytes and let the email package deal with the encodings; text
        # mode would decode the whole mail with the locale encoding and fail
        # on mails that contain other bytes.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f)
        for part in msg.walk():
            if part.get_content_maintype() == "multipart":
                continue  # pure container, never an attachment itself
            raw_name = part.get_filename()
            if not raw_name:
                continue  # body text or another unnamed part
            if debug:
                print("attachment name: %s" % raw_name)
            output_filename = _clean_filename(raw_name)
            data = _part_bytes(part)
            if not output_filename or data is None:
                continue
            output_filename, out = _open_unique(od, output_filename)
            with out:
                if not quiet:
                    print("Writing %s " % output_filename)
                out.write(data)
            output_count += 1
        if output_count == 0 and not quiet:
            print("No attachment found for file %s!" % filename)
    # read and write errors
    except OSError:
        if not quiet:
            print("Problem with %s or one of its attachments!" % filename)
    # anything else (e.g. a mail the parser cannot handle) must not stop the
    # processing of the remaining files
    except Exception as exc:
        print('Fail: %s\n' % filename)
        if debug:
            print(repr(exc))
    return 1, output_count
if __name__ == "__main__":
    # Collect the input files once; both modes then work on the same list.
    eml_files = glob.glob("*.%s" % EXTENSION)
    if single:
        if debug:
            print("running single threaded")
        results = [extract(eml_file) for eml_file in eml_files]
    else:
        if debug:
            print("running multithreaded")
        # let's do this in parallel; Pool(None) starts one worker per CPU
        pool = Pool(None)
        try:
            results = pool.map(extract, eml_files)
        finally:
            # always release the workers, even if map() raised
            pool.close()
            pool.join()
    # Each result is (1, number of attachments). Summing the columns
    # separately also works for an empty result list, where unpacking
    # zip(*results) into the format string would raise IndexError.
    filecnt = sum(result[0] for result in results)
    cnt = sum(result[1] for result in results)
    if not quiet:
        print("Done: Processed %s files with %s attachments." % (filecnt, cnt))
@delgardoxx
Copy link

Cool fork!
Could you add an argument "-f filename" to extract just one eml file?

Best regards
Delgardo

@sansal54
Copy link

sansal54 commented Jul 28, 2021

Hi. Its quite useful code. Just small issue I have is, when there is an attachment like "Indoor Unit 03 Günlük Rapor_3_20210718_115911.csv" inside eml file, this code extracts it as "Indoor Unit 03 Günlük Rapor_3_20210718_115911.cs v". There is space between s and v at the extension. And when i print the of variable, I get the output as "=?iso-8859-3?Q?Indoor_Unit_03_G=FCnl=FCk_Rapor=5F3=5F20210718=5F115911.cs?=
=?iso-8859-3?Q?v?=". Can you please help with this?

Thanks.

@julianjamespy
Copy link

julianjamespy commented Aug 30, 2021

This is a very well written script but I'm having issues. Would you be able to point me in the right direction?

I'm currently testing with a folder that has 100 or so .eml files and each file has a .msg file attached within it.

I am getting the following errors for every email inside the folder as it iterates through.

Fail: <_io.TextIOWrapper name='00000463.eml' mode='r' encoding='cp1257'>

Fail: <_io.TextIOWrapper name='00000464.eml' mode='r' encoding='cp1257'>

Done: Processed 100 files with 0 attachments.

Any assistance would be amazing!

Regards,
-Julian

@xiaolongbao-dimsum
Copy link

xiaolongbao-dimsum commented Mar 25, 2022

Hi @beamzer thanks for this wonderful code

Hi @julianjamespy ,
I had the same problem with .eml embedded in .eml.
For some shady reason, the parser is unable to process adequately .eml attachments.

But at least you can open it as a string or bytes, and you notice that there is kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""

So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]

Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml

Please find below the reviewed function (working on my side):

def extract(filename):
    """
    Extract the attachments of one EML file into the "output" directory.

    An attached e-mail (message/rfc822) is written as an .eml file of its own
    and then processed recursively, so attachments of embedded mails are
    extracted as well. Attachment names are reduced to their base name, so
    nothing is written outside of the output directory.

    filename: path of the EML file to read.

    Returns the tuple (1, number of attachments written, including those of
    embedded mails). Read and write errors are reported on stdout.
    """
    # Local import so the function also works when pasted into a script that
    # only has "import email".
    from email import policy

    od = "output"
    output_count = 0
    try:
        # ensure that an output dir exists
        os.makedirs(od, exist_ok=True)
        # Binary mode: the email package handles the encodings itself, text
        # mode would decode the file with the locale encoding and may fail.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)
        for attachment in msg.iter_attachments():
            announced = attachment.get_filename() or ""
            output_filename = os.path.basename(announced.replace("\\", "/"))
            embedded = (attachment.get_content_type() == "message/rfc822"
                        and attachment.is_multipart())
            if embedded:
                # for EML embedded in EML: the payload holds the inner message,
                # so serialize that instead of cutting off a fixed number of
                # header bytes from the attachment part
                data = attachment.get_payload(0).as_bytes()
                if not output_filename:
                    stem = os.path.splitext(os.path.basename(filename))[0]
                    output_filename = stem + '_void_' + str(output_count) + '.eml'
            elif output_filename:
                data = attachment.get_payload(decode=True)
            else:
                print("No filename for an attachment of %s. Skipping." % filename)
                continue
            # Fetch the payload before creating the file, so a failure does
            # not leave an empty file behind.
            if data is None:
                print("Couldn't get payload for %s" % output_filename)
                continue
            target = os.path.join(od, output_filename)
            with open(target, "wb") as of:
                of.write(data)
            output_count += 1
            if embedded:
                output_count += extract(target)[1]
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors
    except IOError:
        # use the argument: the file object does not exist if open() failed
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment