-
-
Save urschrei/5258588 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
""" | |
2020 update: | |
- More iterators, fewer lists | |
- Python 3 compatible | |
- Processes files in parallel | |
(one thread per CPU, but that's not really how it works) | |
""" | |
import glob | |
import os | |
import email | |
from email import policy | |
from multiprocessing import Pool | |
EXTENSION = "eml" | |
def extract(filename): | |
""" | |
Try to extract the attachments from all files in cwd | |
""" | |
# ensure that an output dir exists | |
od = "output" | |
os.path.exists(od) or os.makedirs(od) | |
output_count = 0 | |
try: | |
with open(filename, "r") as f: | |
msg = email.message_from_file(f, policy=policy.default) | |
for attachment in msg.iter_attachments(): | |
try: | |
output_filename = attachment.get_filename() | |
except AttributeError: | |
print("Got string instead of filename for %s. Skipping." % f.name) | |
continue | |
# If no attachments are found, skip this file | |
if output_filename: | |
with open(os.path.join(od, output_filename), "wb") as of: | |
try: | |
of.write(attachment.get_payload(decode=True)) | |
output_count += 1 | |
except TypeError: | |
print("Couldn't get payload for %s" % output_filename) | |
if output_count == 0: | |
print("No attachment found for file %s!" % f.name) | |
# this should catch read and write errors | |
except IOError: | |
print("Problem with %s or one of its attachments!" % f.name) | |
return 1, output_count | |
if __name__ == "__main__": | |
# let's do this in parallel, using cpu count as number of threads | |
pool = Pool(None) | |
res = pool.map(extract, glob.iglob("*.%s" % EXTENSION)) | |
# need these if we use _async | |
pool.close() | |
pool.join() | |
# 2-element list holding number of files, number of attachments | |
numfiles = [sum(i) for i in zip(*res)] | |
print("Done: Processed {} files with {} attachments.".format(*numfiles)) |
Traceback (most recent call last): File "../Software/parseml.py", line 51, in <module> res = pool.map(extract, glob.iglob("*.%s" % EXTENSION)) File "/usr/lib/python2.7/multiprocessing/pool.py", line 253, in map return self.map_async(func, iterable, chunksize).get() File "/usr/lib/python2.7/multiprocessing/pool.py", line 572, in get raise self._value AttributeError: 'str' object has no attribute 'get_filename'
Something tells me this is not what should happen...
Hi Alexander,
Did you try to run with Python3? Run only def?
Hi Alexander,
Did you try to run with Python3? Run only def?
With Python3 it's:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/usr/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "../Software/parseml.py", line 31, in extract
output_filename = attachment.get_filename()
AttributeError: 'str' object has no attribute 'get_filename'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "../Software/parseml.py", line 48, in <module>
res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
File "/usr/lib/python3.8/multiprocessing/pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib/python3.8/multiprocessing/pool.py", line 768, in get
raise self._value
AttributeError: 'str' object has no attribute 'get_filename'
And the threads stay hanging until you kill them with Ctrl+C.
I tried directly passing input file name to extract function, but it still complains about strings not having "get_filename" attribute.
Hi Alexander,
Did you try to run with Python3? Run only def?With Python3 it's:
multiprocessing.pool.RemoteTraceback: """ Traceback (most recent call last): File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker result = (True, func(*args, **kwds)) File "/usr/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar return list(map(*args)) File "../Software/parseml.py", line 31, in extract output_filename = attachment.get_filename() AttributeError: 'str' object has no attribute 'get_filename' """ The above exception was the direct cause of the following exception: Traceback (most recent call last): File "../Software/parseml.py", line 48, in <module> res = pool.map(extract, glob.iglob("*.%s" % EXTENSION)) File "/usr/lib/python3.8/multiprocessing/pool.py", line 364, in map return self._map_async(func, iterable, mapstar, chunksize).get() File "/usr/lib/python3.8/multiprocessing/pool.py", line 768, in get raise self._value AttributeError: 'str' object has no attribute 'get_filename'
And the threads stay hanging until you kill them with Ctrl+C.
I tried directly passing input file name to extract function, but it still complains about strings not having "get_filename" attribute.
I took Stephan's version and added the option to run single threaded, see:
https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a
you could give that a try and see if it works for you.
You saved my day <3
This is awesome! Exactly what I was looking for, thank you!
Hello @urschrei I would like to revise this line (31-32)
msg = email.message_from_file(f, policy=policy.default)
for attachment in msg.iter_attachments():
And add this import
from email import policy
This will solve this error AttributeError: 'str' object has no attribute 'get_filename'
in the newer python version
Source
@azlkiniue Done!
This is awesome, thanks for sharing! I'm building an Email pipeline in AWS, and I got to the part when it receives and puts the Email into S3, I needed a way to get the attachments out of the .eml files, and this script just does that.
Thanks for sharing! A real time saver and a generous act on your part.
It processes only a part of the files. Error:
`Traceback (most recent call last):
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 48, in mapstar
return list(map(*args))
File "C:\Users\mike\Documents\emailmails\email\emlextr.py", line 38, in extract
of.write(attachment.get_payload(decode=True))
TypeError: a bytes-like object is required, not 'NoneType'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\mike\Documents\emailmails\email\emlextr.py", line 51, in
res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 771, in get
raise self._value
TypeError: a bytes-like object is required, not 'NoneType'`
How to debug?
real time saver! thanks for sharing
Works like a charm!
Is there anyway i can provide the output folder name and the source .eml file name from the command line?
PS: Sorry completely new to python
Works like a charm!
Thank you :)
Nice, thanks!
Hi @urschrei thank you very much !! very useful
Hi @e-tplus
I had the same problem which happened with .eml embedded in .eml.
For some shady reason, the parser is unable to process adequately .eml attachments.
But at least you can open it as a string or bytes, and you notice that there is kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""
So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]
Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml
Please find below the reviewed function (working on my side):
def extract(filename):
"""
Try to extract the attachments from all files in cwd
"""
# ensure that an output dir exists
od = "output"
os.path.exists(od) or os.makedirs(od)
output_count = 0
try:
with open(filename, "r") as f:
msg = email.message_from_file(f, policy=policy.default)
for attachment in msg.iter_attachments():
try:
output_filename = attachment.get_filename()
except AttributeError:
print("Got string instead of filename for %s. Skipping." % f.name)
continue
# If no attachments are found, skip this file
if output_filename:
with open(os.path.join(od, output_filename), "wb") as of:
try:
of.write(attachment.get_payload(decode=True))
output_count += 1
except TypeError:
print("Couldn't get payload for %s" % output_filename)
**#for EML embedded in EML
else:
output_filename = filename.split('.')[:-1][0].split('\\')[-1]+'_void_'+str(output_count)+'.eml'
with open(os.path.join(od, output_filename), "wb") as of:
try:
of.write(attachment.as_bytes()[162:])
output_count += 1
extract(os.path.join(od, output_filename))
except TypeError:
print("Couldn't get payload for %s" % output_filename)**
if output_count == 0:
print("No attachment found for file %s!" % f.name)
# this should catch read and write errors
except IOError:
print("Problem with %s or one of its attachments!" % f.name)
return 1, output_count
only works with python3
got this error when running using python
ImportError: cannot import name policy
Thankyou so much this helped me process all those realestate emails and get the rent receipts thankyou
It fails when there are files with the same name and ext. for this I added the date of the email eg. "15 Apr 2021".
if output_filename:
with open(os.path.join(od, str(msg['date'])[5:16] + " " + output_filename), "wb") as of:
try:
of.write(attachment.get_payload(decode=True))
output_count += 1
except TypeError:
print("Couldn't get payload for %s" % output_filename)
EXTREMELY useful, much thanks!
@urschrei
Many thanks.
But I find this code cannot extract .msg files which are from outlook app. When I attach one eml and one msg, only eml file is saved.
Worked great! Thank you for your hard work
Didn't work for me for embedded eml
inside eml
, with pdfs
in, this did:
(Note that it only seeks out pdf
files)
#!/usr/bin/python3
import os
import sys
import email
from email import policy
from email.parser import BytesParser
from email.iterators import typed_subpart_iterator
def extract_attachments(email_message, output_folder):
for part in typed_subpart_iterator(email_message, 'application', 'pdf'):
filename = part.get_filename()
if not filename:
continue
filepath = os.path.join(output_folder, filename)
with open(filepath, 'wb') as f:
f.write(part.get_payload(decode=True))
def parse_email(file_path, output_folder):
with open(file_path, 'rb') as f:
msg = BytesParser(policy=policy.default).parse(f)
if msg.is_multipart():
for payload in msg.iter_parts():
if payload.get_content_type() == 'message/rfc822':
extract_attachments(payload.get_payload(0), output_folder)
elif payload.get_content_type() == 'application/pdf':
extract_attachments(msg, output_folder)
else:
extract_attachments(msg, output_folder)
if __name__ == "__main__":
file_path = sys.argv[1]
output_folder = sys.argv[2]
os.makedirs(output_folder, exist_ok=True)
parse_email(file_path, output_folder)
First argument is the .eml
and second is where you want the files extracted.
I wanted to take a moment to express my gratitude for the Python code you wrote for extracting attachments from EML files. Your solution has been incredibly helpful and efficient for my needs. The way you handled the EML format and attachment extraction is impressive and much appreciated.
Thank you so much for your time and effort. It’s clear that a lot of thought went into creating this code, and I’m genuinely grateful for your contribution.
Just worked. Thank you ! (Probably GitHub should add a comment box to repositories as well. Sometimes you just want to say thanks and there is no simple way to do it.)
Something tells me this is not what should happen...