-
-
Save beamzer/8a1e9629c203eaa9eb8d2fb4725b053a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3.7 | |
""" | |
source: https://gist.github.com/urschrei/5258588 by Stephan Hügel | |
2020 update: | |
- More iterators, fewer lists | |
- Python 3 compatible | |
- Processes files in parallel | |
(one thread per CPU, but that's not really how it works) | |
Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a ) | |
2020-09-20: | |
- handling of same filenames (write everything because contents might be different) | |
- handling of filenames with * | |
- handling of mkdir errors | |
- added arguments | |
2020-09-23 (v1.2) | |
- version_nr before extension | |
- error handling for utf-8 chars in eml (on error continue) | |
2020-10-02 (v1.3) | |
- now correctly handles RFC2047 MIME encoded filenames | |
2020-10-06 (v1.4) | |
- now handles multi-line filenames | |
- fixed handling of emails with no attachments | |
""" | |
import glob | |
import os | |
import email | |
import argparse | |
from multiprocessing import Pool | |
from cs.rfc2047 import unrfc2047 | |
# File extension (without the dot) of the mail files picked up from the cwd.
EXTENSION = "eml"

# Command line interface: three independent on/off switches.
parser = argparse.ArgumentParser(description='extract attachments from eml files')
for _short_flag, _long_flag, _help_text in (
    ('-d', '--debug', 'print debug messages to stderr'),
    ('-s', '--single',
     'run as single thread (default = multithreaded, one thread per core)'),
    ('-q', '--quiet', 'no output'),
):
    parser.add_argument(_short_flag, _long_flag, action='store_true', help=_help_text)

args = parser.parse_args()
debug, single, quiet = args.debug, args.single, args.quiet

if debug:
    print("debug output is active")

# Directory that receives every extracted attachment.
od = "attachments"

# Create the output directory once, up front, instead of inside extract().
# exist_ok=True keeps a concurrent creation by another process from failing.
if not os.path.exists(od):
    os.makedirs(od, exist_ok=True)
def _clean_attachment_name(raw_name):
    """Decode and sanitise an attachment name; return "" if nothing usable is left."""
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    # Folded headers leave line breaks inside the parameter value.
    name = raw_name.replace("\r", "").replace("\n", "")
    try:
        # RFC 2047 decoding; whitespace between adjacent encoded-words is
        # dropped, so a name split over two encoded-words is rejoined intact.
        name = str(make_header(decode_header(name)))
    except (HeaderParseError, LookupError, UnicodeError) as err:
        # Best effort: keep the undecoded name rather than lose the attachment.
        print("could not decode attachment name %r: %s" % (raw_name, err))
    name = name.replace("\t", "_").replace("*", "#")
    # The name comes from the mail, i.e. it is untrusted: strip any directory
    # part so it cannot escape the output directory.
    name = os.path.basename(name.replace("\\", "/"))
    return "" if name in (".", "..") else name


def _create_unique(directory, name):
    """Exclusively create directory/name, inserting _1, _2, ... before the extension on clashes.

    Returns the open binary file object and the name that was finally used.
    """
    stem, ext = os.path.splitext(name)
    attempt = 0
    while True:
        candidate = name if attempt == 0 else "%s_%d%s" % (stem, attempt, ext)
        try:
            # Mode "x" fails if the file exists, so two worker processes can
            # never pick the same name and overwrite each other.
            return open(os.path.join(directory, candidate), "xb"), candidate
        except FileExistsError:
            attempt += 1


def extract(filename):
    """
    Extract every named attachment of the eml file *filename* into the
    output directory `od`.

    The mail is parsed from bytes, so it does not depend on the locale's
    default text encoding. All MIME parts (also nested ones) that carry a
    file name are written; existing files are never overwritten, a numeric
    suffix is added before the extension instead.

    Returns the tuple (1, number_of_attachments_written). Errors are
    reported on stdout and never raised, so one bad mail cannot abort a
    whole (parallel) run.
    """
    if debug:
        print("=> reading %s" % filename)
    output_count = 0
    try:
        with open(filename, "rb") as eml:
            msg = email.message_from_binary_file(eml)

        for part in msg.walk():
            raw_name = part.get_filename()
            if not raw_name:
                # body text, multipart containers, unnamed inline parts
                continue
            if debug:
                print("attachment name: %s" % raw_name)

            output_filename = _clean_attachment_name(raw_name)
            if debug and output_filename != raw_name:
                print("decoded attachment name: %s" % output_filename)
            if not output_filename:
                continue

            data = part.get_payload(decode=True)
            if (data is None and part.is_multipart()
                    and part.get_content_type() == "message/rfc822"):
                # An attached mail has no decodable payload of its own;
                # save the embedded message instead.
                data = part.get_payload(0).as_bytes()
            if data is None:
                if not quiet:
                    print("No decodable payload for %s in %s" % (output_filename, filename))
                continue

            handle, output_filename = _create_unique(od, output_filename)
            with handle:
                if not quiet:
                    print("Writing %s " % output_filename)
                handle.write(data)
            output_count += 1

        if output_count == 0 and not quiet:
            print("No attachment found for file %s!" % filename)
    # read and write errors
    except OSError:
        if not quiet:
            print("Problem with %s or one of its attachments!" % filename)
    # Deliberate catch-all at the per-file boundary: report and carry on with
    # the next mail instead of killing the worker pool.
    except Exception as err:
        print("Fail: %s (%s: %s)\n" % (filename, type(err).__name__, err))
    return 1, output_count
if __name__ == "__main__":
    # Collect the work list once; sorting makes the processing order (and with
    # it the numbering of duplicate attachment names) reproducible.
    eml_files = sorted(glob.glob("*.%s" % EXTENSION))

    if single:
        if debug:
            print("running single threaded")
        results = [extract(path) for path in eml_files]
    else:
        if debug:
            print("running multithreaded")
        # Pool(None) starts one worker process per CPU. map() blocks until
        # every file is done, so leaving the with-block is safe.
        with Pool(None) as pool:
            results = pool.map(extract, eml_files)

    # extract() returns (1, attachments_written) per file. Summing the columns
    # explicitly also works for an empty result list, where the previous
    # zip(*res) / format(*numfiles) combination raised IndexError.
    filecnt = sum(files for files, _ in results)
    cnt = sum(attachments for _, attachments in results)
    if not quiet:
        print("Done: Processed %s files with %s attachments." % (filecnt, cnt))
Hi. It's quite useful code. Just one small issue I have: when there is an attachment like "Indoor Unit 03 Günlük Rapor_3_20210718_115911.csv" inside an eml file, this code extracts it as "Indoor Unit 03 Günlük Rapor_3_20210718_115911.cs v". There is a space between the s and the v in the extension. And when I print the `of` variable, I get the output "=?iso-8859-3?Q?Indoor_Unit_03_G=FCnl=FCk_Rapor=5F3=5F20210718=5F115911.cs?=
=?iso-8859-3?Q?v?=". Can you please help with this?
Thanks.
This is a very well written script, but I'm having issues. Would you be able to point me in the right direction?
I'm currently testing with a folder that has 100 or so .eml files, and each file has a .msg file attached within it.
I am getting the following errors for every email inside the folder as it iterates through.
Fail: <_io.TextIOWrapper name='00000463.eml' mode='r' encoding='cp1257'>
Fail: <_io.TextIOWrapper name='00000464.eml' mode='r' encoding='cp1257'>
Done: Processed 100 files with 0 attachments.
Any assistance would be amazing!
Regards,
-Julian
Hi @beamzer thanks for this wonderful code
Hi @julianjamespy ,
I had the same problem with an .eml embedded in an .eml.
For some reason, the parser is unable to process .eml attachments adequately.
But at least you can open it as a string or bytes, and then you notice that there is a kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""
So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]
Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml
Please find below the reviewed function (working on my side):
def extract(filename):
    """
    Extract the attachments of the eml file *filename* into ./output.

    Regular attachments are written under their own (sanitised) file name.
    An attached mail (message/rfc822) is saved as an .eml file, named after
    the attachment or, when it has no name, `<mail>_void_<n>.eml`, and is
    then processed recursively so that its own attachments are extracted too.

    Returns the tuple (1, number_of_attachments_written_from_this_file);
    attachments found by the recursive calls are not included in the count.
    """
    from email import policy

    # ensure that an output dir exists
    od = "output"
    os.makedirs(od, exist_ok=True)
    output_count = 0
    try:
        # Parse from bytes so the result does not depend on the locale's
        # default text encoding.
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)

        for attachment in msg.iter_attachments():
            # Attachment names are untrusted input: keep the last path component only.
            output_filename = os.path.basename(
                (attachment.get_filename() or "").replace("\\", "/"))
            if output_filename in (".", ".."):
                output_filename = ""

            # EML embedded in EML: the part's payload is the embedded message
            # object itself, so no header bytes have to be cut off by hand.
            if attachment.get_content_type() == "message/rfc822" and attachment.is_multipart():
                if not output_filename:
                    stem = os.path.splitext(os.path.basename(filename))[0]
                    output_filename = stem + '_void_' + str(output_count) + '.eml'
                target = os.path.join(od, output_filename)
                with open(target, "wb") as of:
                    of.write(attachment.get_payload(0).as_bytes())
                output_count += 1
                extract(target)
                continue

            if not output_filename:
                print("Got attachment without filename in %s. Skipping." % filename)
                continue

            payload = attachment.get_payload(decode=True)
            if payload is None:
                # checked before opening, so no empty file is left behind
                print("Couldn't get payload for %s" % output_filename)
                continue
            with open(os.path.join(od, output_filename), "wb") as of:
                of.write(payload)
            output_count += 1

        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors
    except IOError:
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count
Cool fork!
Could you add an argument "-f filename" to extract just one eml file?
Best regards
Delgardo