Skip to content

Instantly share code, notes, and snippets.

@urschrei
Last active June 12, 2025 08:54
Show Gist options
  • Save urschrei/5258588 to your computer and use it in GitHub Desktop.
Save urschrei/5258588 to your computer and use it in GitHub Desktop.
Extract attachments from EML files in the current dir, and write them to the output subdir. Now with recursion and robust filename handling
#!/usr/bin/env python3
"""
2025 update:
- Recursive extraction from nested EML files
- Robust filename handling with sanitization and deduplication
- Proper logging instead of print statements
- Enhanced error handling and validation
- Binary file reading for better encoding support
- Cross-platform filename compatibility
- Depth-limited recursion to prevent infinite loops
"""
import email
import glob
import logging
import os
import re
import sys
import unicodedata
from collections import defaultdict
from email import policy
from multiprocessing import Pool
from pathlib import Path
# File extension (without the leading dot) used by main() to glob for input
# e-mail files in the current directory.
EXTENSION = "eml"
# Maximum length, in characters, that sanitize_filename() allows for a name.
MAX_FILENAME_LENGTH = 255
# Regex character class of the characters sanitize_filename() replaces with
# "_": < > : " / \ | ? * and the ASCII control characters 0x00-0x1f.
INVALID_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
# Configure logging
def setup_logging(level=logging.INFO):
    """
    Configure root logging to stdout and to the file ``eml_extractor.log``.

    Args:
        level: Minimum level handed to ``logging.basicConfig``.

    Returns:
        logging.Logger: The logger named after this module.
    """
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            # Attachment names are arbitrary Unicode; pin the log file to
            # UTF-8 so writing them never depends on the locale encoding.
            logging.FileHandler("eml_extractor.log", encoding="utf-8"),
        ],
    )
    return logging.getLogger(__name__)
def sanitize_filename(filename):
    """
    Sanitize filename for cross-platform compatibility.

    Handles newlines, invalid characters, Windows reserved device names and
    length limits.

    Args:
        filename: Raw attachment name; may be ``None`` or empty.

    Returns:
        str: A name of at most ``MAX_FILENAME_LENGTH`` characters, or
        ``"unnamed_attachment"`` when nothing usable remains.
    """
    if not filename:
        return "unnamed_attachment"

    # Normalize unicode characters
    filename = unicodedata.normalize("NFKD", filename)

    # Replace invalid characters, collapse whitespace (this also removes
    # newlines) and drop leading/trailing spaces and dots.
    filename = re.sub(INVALID_CHARS, "_", filename)
    filename = re.sub(r"\s+", " ", filename)
    filename = filename.strip(" .")
    if not filename:
        return "unnamed_attachment"

    # Windows refuses these device names regardless of extension or case
    # ("NUL.txt" is still the null device), so make them ordinary names.
    reserved = {"CON", "PRN", "AUX", "NUL"}
    reserved.update(
        f"{device}{number}" for device in ("COM", "LPT") for number in range(1, 10)
    )
    if filename.split(".", 1)[0].rstrip().upper() in reserved:
        filename = f"_{filename}"

    # Handle length limit while keeping the extension.
    if len(filename) > MAX_FILENAME_LENGTH:
        name, ext = os.path.splitext(filename)
        # Clamp the extension so at least one character of the name
        # survives; otherwise the slice index below would be <= 0 and the
        # result would still exceed the limit.
        ext = ext[: MAX_FILENAME_LENGTH - 1].rstrip(" .")
        # Truncation may expose a trailing space or dot again; strip it.
        name = name[: MAX_FILENAME_LENGTH - len(ext)].rstrip(" .")
        filename = name + ext

    return filename or "unnamed_attachment"
def get_unique_filename(output_dir, filename, file_counter):
    """
    Generate a unique output path to avoid overwrites.

    The name is unique with respect to both the names handed out earlier
    (tracked in ``file_counter``) and the files already present in
    ``output_dir``. The on-disk check matters because every EML file gets
    its own counter while all of them write into the same directory.

    Args:
        output_dir: Directory the attachment will be written to.
        filename: Raw attachment name; it is sanitized first.
        file_counter: Dict mapping a sanitized name to the highest numeric
            suffix used for it so far. Updated in place.

    Returns:
        str: ``output_dir`` joined with a name that does not exist yet.
    """
    base_name = sanitize_filename(filename)
    candidate = os.path.join(output_dir, base_name)

    # First sighting and nothing on disk: use the name unchanged.
    # ("in" does not insert a key, even when file_counter is a defaultdict.)
    if base_name not in file_counter and not os.path.exists(candidate):
        file_counter[base_name] = 0
        return candidate

    name, ext = os.path.splitext(base_name)
    count = file_counter.get(base_name, 0)
    while True:
        count += 1
        suffix = f"_{count}"
        # Leave room for the suffix so the result respects the length limit.
        stem = name[: max(1, MAX_FILENAME_LENGTH - len(ext) - len(suffix))]
        candidate = os.path.join(output_dir, f"{stem}{suffix}{ext}")
        if not os.path.exists(candidate):
            break
    file_counter[base_name] = count
    return candidate
def extract_attachments_recursive(msg, output_dir, file_counter, depth=0, max_depth=10):
    """
    Recursively extract attachments, including from nested EML files.

    Attached messages (``message/rfc822``) are serialized with
    ``as_bytes()`` and saved as files, then searched for attachments of
    their own. ``get_payload(decode=True)`` returns ``None`` for such parts,
    so they cannot go through the regular payload path.

    Args:
        msg: Email message object providing ``iter_attachments()``.
        output_dir: Output directory path.
        file_counter: Dict tracking filename duplicates.
        depth: Current recursion depth.
        max_depth: Maximum recursion depth to prevent infinite loops.

    Returns:
        int: Number of attachments extracted, nested ones included.
    """
    logger = logging.getLogger(__name__)
    if depth > max_depth:
        logger.warning(
            f"Maximum recursion depth ({max_depth}) reached, skipping further nesting"
        )
        return 0

    # Fallback extensions for attachments that carry no filename.
    ext_map = {
        "text/plain": ".txt",
        "text/html": ".html",
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "application/pdf": ".pdf",
        "message/rfc822": ".eml",
    }
    indent = " " * depth
    attachment_count = 0

    for part in msg.iter_attachments():
        try:
            filename = part.get_filename()
            content_type = part.get_content_type()
            if not filename:
                # Generate filename based on content type
                ext = ext_map.get(content_type, ".bin")
                filename = f"attachment_{attachment_count + 1}{ext}"

            try:
                nested_msg = None
                if content_type == "message/rfc822" and part.is_multipart():
                    # The attached message is the single child of this part.
                    nested_msg = part.get_payload(0)
                    payload = nested_msg.as_bytes()
                else:
                    payload = part.get_payload(decode=True)
                if payload is None:
                    logger.warning(f"Empty payload for {filename}")
                    continue

                # Reserve a name only once there is something to write.
                output_path = get_unique_filename(output_dir, filename, file_counter)
                with open(output_path, "wb") as of:
                    of.write(payload)
                attachment_count += 1
                saved_name = os.path.basename(output_path)
                logger.info(f"{indent}Extracted: {saved_name}")

                try:
                    # Attachments that merely look like e-mails (by name or
                    # type) are parsed speculatively and only accepted when
                    # typical message headers are present.
                    if nested_msg is None and (
                        filename.lower().endswith(".eml")
                        or content_type in ("message/rfc822", "text/plain")
                    ):
                        candidate = email.message_from_bytes(
                            payload, policy=policy.default
                        )
                        if candidate.get("Message-ID") or candidate.get("From"):
                            nested_msg = candidate

                    if nested_msg is not None:
                        logger.info(f"{indent}Processing nested EML: {saved_name}")
                        attachment_count += extract_attachments_recursive(
                            nested_msg,
                            output_dir,
                            file_counter,
                            depth + 1,
                            max_depth,
                        )
                except Exception as e:
                    # Not a valid email message, continue normally
                    logger.debug(f"File {filename} not a valid nested email: {e}")
            except (TypeError, OSError) as e:
                logger.error(f"Error extracting {filename}: {e}")
        except Exception as e:
            logger.error(f"Error processing attachment: {e}")

    return attachment_count
def extract(filename):
    """
    Pull every attachment out of one EML file, descending into nested mails.

    The attachments are written to the ``output`` directory, which is
    created when missing.

    Args:
        filename: Path of the EML file to read.

    Returns:
        tuple: (files_processed, attachments_extracted); the first item is
        always 1, the second is 0 when processing failed.
    """
    log = logging.getLogger(__name__)
    destination = Path("output")
    destination.mkdir(exist_ok=True)
    seen_names = defaultdict(int)

    try:
        # Binary read: the email parser takes care of the encodings.
        with open(filename, "rb") as handle:
            raw = handle.read()
        message = email.message_from_bytes(raw, policy=policy.default)
        log.info(f"Processing: {filename}")

        extracted = extract_attachments_recursive(
            message, str(destination), seen_names
        )
        if extracted:
            log.info(f"Extracted {extracted} attachment(s) from {filename}")
        else:
            log.info(f"No attachments found in {filename}")
        return 1, extracted
    except Exception as e:
        log.error(f"Error processing {filename}: {e}")
        return 1, 0
def _init_worker_logging():
    """Configure logging in a pool worker that did not inherit any handlers."""
    # Workers started with the "spawn" method begin with an unconfigured
    # root logger, which would silently drop every INFO message emitted by
    # extract(). Forked workers already carry the parent's handlers.
    if not logging.getLogger().handlers:
        setup_logging()


def main():
    """
    Extract the attachments of every EML file in the current directory.

    The files are processed in parallel by a process pool; a summary with
    the number of files and attachments is logged at the end.
    """
    logger = setup_logging()
    eml_files = glob.glob(f"*.{EXTENSION}")
    if not eml_files:
        logger.warning(f"No .{EXTENSION} files found in current directory")
        return
    logger.info(f"Found {len(eml_files)} EML file(s) to process")

    try:
        # Process files in parallel
        with Pool(initializer=_init_worker_logging) as pool:
            results = pool.map(extract, eml_files)
        # Each result is a (files, attachments) pair; sum them column-wise.
        total_files, total_attachments = map(sum, zip(*results))
        logger.info(
            f"Summary: Files processed: {total_files}, Attachments extracted: {total_attachments}"
        )
    except KeyboardInterrupt:
        logger.info("Processing interrupted by user")
    except Exception as e:
        logger.error(f"Error during processing: {e}")


# The guard is required for multiprocessing: spawned workers re-import this
# module and must not start the extraction again.
if __name__ == "__main__":
    main()
@sosmii
Copy link

sosmii commented Jun 30, 2020

You are my savior

@timotgl
Copy link

timotgl commented Aug 19, 2020

Works as advertised, many thanks 👍 👍 👍

@nonamephysics
Copy link

Many thanks.
I added a simple replace step to avoid an error when an attachment's filename contains characters such as '\n'.
It may make sense to add such a check and replacement to the gist.

@bolbatav
Copy link

bolbatav commented Sep 8, 2020

Traceback (most recent call last):
  File "../Software/parseml.py", line 51, in <module>
    res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 253, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 572, in get
    raise self._value
AttributeError: 'str' object has no attribute 'get_filename'

Something tells me this is not what should happen...

@nonamephysics
Copy link

Traceback (most recent call last):
  File "../Software/parseml.py", line 51, in <module>
    res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 253, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 572, in get
    raise self._value
AttributeError: 'str' object has no attribute 'get_filename'

Something tells me this is not what should happen...

Hi Alexander,
Did you try to run with Python3? Run only def?

@bolbatav
Copy link

bolbatav commented Sep 8, 2020

Hi Alexander,
Did you try to run with Python3? Run only def?

With Python3 it's:

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "../Software/parseml.py", line 31, in extract
    output_filename = attachment.get_filename()
AttributeError: 'str' object has no attribute 'get_filename'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "../Software/parseml.py", line 48, in <module>
    res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 364, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 768, in get
    raise self._value
AttributeError: 'str' object has no attribute 'get_filename'

And the threads stay hanging until you kill them with Ctrl+C.
I tried directly passing input file name to extract function, but it still complains about strings not having "get_filename" attribute.

@beamzer
Copy link

beamzer commented Sep 20, 2020

Hi Alexander,
Did you try to run with Python3? Run only def?

With Python3 it's:

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "../Software/parseml.py", line 31, in extract
    output_filename = attachment.get_filename()
AttributeError: 'str' object has no attribute 'get_filename'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "../Software/parseml.py", line 48, in <module>
    res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 364, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 768, in get
    raise self._value
AttributeError: 'str' object has no attribute 'get_filename'

And the threads stay hanging until you kill them with Ctrl+C.
I tried directly passing input file name to extract function, but it still complains about strings not having "get_filename" attribute.

I took Stephan's version and added the option to run single threaded, see:
https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a
you could give that a try and see if it works for you.

@d-weeteling
Copy link

You saved my day <3

@nickpieper
Copy link

This is awesome! Exactly what I was looking for, thank you!

@azlkiniue
Copy link

azlkiniue commented Mar 18, 2021

Hello @urschrei I would like to revise this line (31-32)

msg = email.message_from_file(f, policy=policy.default)
for attachment in msg.iter_attachments():

And add this import
from email import policy

This will solve this error AttributeError: 'str' object has no attribute 'get_filename' in the newer python version
Source

@urschrei
Copy link
Author

@azlkiniue Done!

@mikloslorinczi
Copy link

This is awesome, thanks for sharing! I'm building an Email pipeline in AWS, and I got to the part when it receives and puts the Email into S3, I needed a way to get the attachments out of the .eml files, and this script just does that.

@arnonzamir
Copy link

Thanks for sharing! A real time saver and a generous act on your part.

@e-tplus
Copy link

e-tplus commented Sep 16, 2021

It processes only a part of the files. Error:

`Traceback (most recent call last):
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 48, in mapstar
return list(map(*args))
File "C:\Users\mike\Documents\emailmails\email\emlextr.py", line 38, in extract
of.write(attachment.get_payload(decode=True))
TypeError: a bytes-like object is required, not 'NoneType'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "C:\Users\mike\Documents\emailmails\email\emlextr.py", line 51, in
res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\mike\AppData\Local\Programs\Python\Python39\lib\multiprocessing\pool.py", line 771, in get
raise self._value
TypeError: a bytes-like object is required, not 'NoneType'`

How to debug?

@JulioLaral
Copy link

real time saver! thanks for sharing

@soubhikchatterjee
Copy link

Works like a charm!

Is there any way I can provide the output folder name and the source .eml file name from the command line?

PS: Sorry completely new to python

@id82
Copy link

id82 commented Jan 31, 2022

Works like a charm!
Thank you :)

@Zylatis
Copy link

Zylatis commented Mar 6, 2022

Nice, thanks!

@xiaolongbao-dimsum
Copy link

Hi @urschrei thank you very much !! very useful

Hi @e-tplus

I had the same problem which happened with .eml embedded in .eml.
For some reason, the parser is unable to process .eml attachments properly.

But at least you can open it as a string or bytes, and you notice that there is kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""

So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]

Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml

Please find below the reviewed function (working on my side):

def extract(filename):
    """
    Extract the attachments of one .eml file into the "output" directory.

    Attached messages (message/rfc822) are saved as .eml files and then
    processed recursively so that their own attachments are extracted too.

    Returns a tuple (1, number of attachments written for this file);
    attachments written by the recursive calls are not included.
    """
    # ensure that an output dir exists
    od = "output"
    os.makedirs(od, exist_ok=True)
    output_count = 0
    try:
        # binary mode: let the email parser deal with the encodings
        with open(filename, "rb") as f:
            msg = email.message_from_binary_file(f, policy=policy.default)
        for attachment in msg.iter_attachments():
            try:
                output_filename = attachment.get_filename()
            except AttributeError:
                print("Got string instead of filename for %s. Skipping." % filename)
                continue
            embedded = (
                attachment.get_content_type() == "message/rfc822"
                and attachment.is_multipart()
            )
            if embedded:
                # for EML embedded in EML: get_payload(decode=True) is None
                # for these parts, so serialize the attached message itself
                payload = attachment.get_payload(0).as_bytes()
                if not output_filename:
                    stem = os.path.splitext(os.path.basename(filename))[0]
                    output_filename = "%s_void_%d.eml" % (stem, output_count)
            elif output_filename:
                payload = attachment.get_payload(decode=True)
            else:
                print("No filename for an attachment in %s. Skipping." % filename)
                continue
            if payload is None:
                print("Couldn't get payload for %s" % output_filename)
                continue
            # attachment names are untrusted: keep them inside the output dir
            output_path = os.path.join(od, os.path.basename(output_filename))
            # open the file only once there is something to write, so a
            # failed attachment does not leave an empty file behind
            with open(output_path, "wb") as of:
                of.write(payload)
            output_count += 1
            if embedded:
                extract(output_path)
        if output_count == 0:
            print("No attachment found for file %s!" % filename)
    # this should catch read and write errors
    except IOError:
        print("Problem with %s or one of its attachments!" % filename)
    return 1, output_count

@farzher
Copy link

farzher commented Apr 16, 2022

only works with python3

got this error when running using python ImportError: cannot import name policy

@HUrquhart
Copy link

Thank you so much! This helped me process all those real estate emails and get the rent receipts. Thank you.

@maldunate
Copy link

It fails when there are files with the same name and extension. To handle this, I prefixed the filename with the date of the email, e.g. "15 Apr 2021".

if output_filename:
with open(os.path.join(od, str(msg['date'])[5:16] + " " + output_filename), "wb") as of:
try:
of.write(attachment.get_payload(decode=True))
output_count += 1
except TypeError:
print("Couldn't get payload for %s" % output_filename)

@dharley-gaggle
Copy link

EXTREMELY useful, much thanks!

@jonathanyaod3
Copy link

@urschrei
Many thanks.
But I found that this code cannot extract .msg files, which come from the Outlook app. When I attach one .eml and one .msg file, only the .eml file is saved.

@kucster
Copy link

kucster commented Apr 24, 2023

Worked great! Thank you for your hard work

@dagelf
Copy link

dagelf commented Oct 3, 2023

Didn't work for me for embedded eml inside eml, with pdfs in, this did:
(Note that it only seeks out pdf files)

#!/usr/bin/python3
import os
import sys
import email
from email import policy
from email.parser import BytesParser
from email.iterators import typed_subpart_iterator

def extract_attachments(email_message, output_folder):
    """Write every named application/pdf part of email_message to output_folder.

    Parts without a filename are skipped. Only the final path component of
    the attachment name is used, so a crafted name such as "../x.pdf"
    cannot place a file outside output_folder.
    """
    for part in typed_subpart_iterator(email_message, 'application', 'pdf'):
        filename = part.get_filename()
        if not filename:
            continue
        # The name comes from the e-mail and is untrusted: drop any
        # directory components, whichever separator was used.
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # Defensive: nothing decodable to write for this part.
            continue
        filepath = os.path.join(output_folder, safe_name)
        with open(filepath, 'wb') as f:
            f.write(payload)

def parse_email(file_path, output_folder):
    """Parse the .eml file at file_path and save its PDF attachments.

    The attachments are written to output_folder by extract_attachments().
    """
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    # extract_attachments() iterates over every sub-part of the message
    # tree, attached messages (message/rfc822) included, so one call on the
    # root finds the PDFs at any nesting depth. Dispatching on the top-level
    # parts missed deeper PDFs and repeated the same extraction once per
    # top-level PDF.
    extract_attachments(msg, output_folder)

if __name__ == "__main__":
    # Expects two arguments: the .eml file and the extraction folder.
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("Usage: %s <file.eml> <output_folder>" % sys.argv[0])
    file_path = sys.argv[1]
    output_folder = sys.argv[2]
    os.makedirs(output_folder, exist_ok=True)
    parse_email(file_path, output_folder)

First argument is the .eml and second is where you want the files extracted.

@rajjana
Copy link

rajjana commented Aug 18, 2024

I wanted to take a moment to express my gratitude for the Python code you wrote for extracting attachments from EML files. Your solution has been incredibly helpful and efficient for my needs. The way you handled the EML format and attachment extraction is impressive and much appreciated.

Thank you so much for your time and effort. It’s clear that a lot of thought went into creating this code, and I’m genuinely grateful for your contribution.

@jjkavalam
Copy link

Just worked. Thank you ! (Probably GitHub should add a comment box to repositories as well. Sometimes you just want to say thanks and there is no simple way to do it.)

@abiank
Copy link

abiank commented Jan 15, 2025

Saved half of my morning, thank you

@scivision
Copy link

See https://gist.github.com/scivision/12d4177b743fafc9e5ff37d14bd44e8d for a version that handles a single .eml file; one could call it in a loop, which separates looping over the .eml files from the extraction logic.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment