Skip to content

Instantly share code, notes, and snippets.

@joscha
Forked from polo2ro/dump-imap.py
Last active May 16, 2025 10:10
Show Gist options
  • Save joscha/1b5c4065125433703d43c190783145c3 to your computer and use it in GitHub Desktop.
Save joscha/1b5c4065125433703d43c190783145c3 to your computer and use it in GitHub Desktop.
Simple script to dump an IMAP folder into .eml files, naming each file after the SHA-256 hash of its Message-ID
export DIRENV_WARN_TIMEOUT=20s
eval "$(devenv direnvrc)"
# The use_devenv function supports passing flags to the devenv command
# For example: use devenv --impure --option services.postgres.enable:bool true
use devenv
# Devenv
.devenv*
devenv.local.nix
# direnv
.direnv
# pre-commit
.pre-commit-config.yaml
/emails/**/*.eml
/exec.sh
/emails.json
.DS_Store

Download emails from a (G)Mail IMAP mailbox

Example

Download every email addressed to me from a German-language Gmail account (hence the `[Gmail]/Alle Nachrichten` folder), reading the app password from 1Password.

 IMAP_PASSWORD="$(op read 'op://my/secret/ref/password')" \
    python \
    dump-imap.py \
    -r '"[Gmail]/Alle Nachrichten"' \
    -u "you@gmail.com" \
    -t "you@gmail.com"

Label-Studio

Sample labeling interface:

<View>
  <Header value="$subject"></Header>
  From: <Text name="from" value="$from"/>
  <Text name="text" value="$text"/>
  <List name="attachments" value="$attachments" title="Attachments" />
  <View style="box-shadow: 2px 2px 5px #999; padding: 20px; margin-top: 2em; border-radius: 5px;">
    <Header value="Choose whether this is a spam email or not"/>
    <Choices name="is-spam" toName="text" choice="single" showInLine="true">
    <Choice value="Yes"/><Choice value="No"/><Choice value="Unsure"/></Choices>
  </View>
</View><!-- {
    "id": "b363babfaf876150d862662e594b65265761e9d3e0f72dfaaf2ddcf4396fac4d",
    "data": {
      "subject": "Hello",
      "from": "Some One <[email protected]>",
      "date": "Sun, 26 Dec 2021 15:39:14 +0000",
      "text": "Dear Sir, please buy XXX.",
      "attachments": [
        { "id": "b363babf", "title": "flyer.pdf" }
      ]
    }
  } -->

Caveats

  • Reads/writes emails sequentially, not in parallel.
import glob
import email
import json
import html2text
import re
import chardet
import hashlib
from email import policy
from tqdm import tqdm
from pathlib import Path
def detect_encoding(file_path):
    """Guess the character encoding of a file from its leading bytes.

    Only the first 100000 bytes are read and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of chardet's result. Callers should be
        prepared for a falsy value when no guess is available.
    """
    with open(file_path, "rb") as handle:
        sample = handle.read(100000)
    return chardet.detect(sample)["encoding"]
def clean_text(text):
    """Sanitise a string so it can be written to the JSON export.

    Two things happen to non-empty input:

    * bidirectional formatting marks (U+200E/U+200F, U+202A-U+202E and
      U+2066-U+2069) are removed;
    * code points that cannot be encoded as UTF-8 (lone surrogates) are
      replaced with ``?`` by round-tripping through an ``errors="replace"``
      encode.

    Args:
        text: The string to clean; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` for falsy input.
    """
    if text:
        without_marks = re.sub(
            r"[\u200E\u200F\u202A-\u202E\u2066-\u2069]", "", text
        )
        return without_marks.encode("utf-8", errors="replace").decode("utf-8")
    return ""
def is_image_attachment(part):
    """Tell whether a MIME part looks like an image.

    The declared content type decides first; if it is not ``image/*`` the
    part's filename extension is compared (case-insensitively) against a
    fixed list of common image suffixes.

    Args:
        part: A message part exposing ``get_content_type()`` and
            ``get_filename()``.

    Returns:
        ``True`` for image parts, ``False`` otherwise (including parts that
        have no filename and a non-image content type).
    """
    if part.get_content_type().startswith("image/"):
        return True

    name = part.get_filename()
    if not name:
        return False

    suffix = Path(name).suffix.lower()
    return suffix in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]
def get_attachment_info(part):
    """Describe a named MIME part as an ``{"id": ..., "title": ...}`` dict.

    The id is the part's Content-ID with surrounding whitespace and angle
    brackets removed. When the header is missing or empty after stripping
    (e.g. ``<>``), the id falls back to the SHA-256 hex digest of the
    filename so that it is never an empty string.

    Args:
        part: A message part exposing ``get_filename()`` and ``get()``.

    Returns:
        A dict with ``"id"`` and ``"title"`` keys, or ``None`` when the part
        has no filename.
    """
    filename = part.get_filename()
    if not filename:
        return None

    content_id = str(part.get("Content-ID", "") or "").strip().strip("<>")
    if content_id:
        # Sanitise like the title so the id is always safe to serialise.
        content_id = clean_text(content_id)
    else:
        # errors="replace": a filename decoded from a malformed header may
        # contain lone surrogates, which a strict encode would reject with
        # UnicodeEncodeError. Valid filenames hash exactly as before.
        content_id = hashlib.sha256(
            filename.encode("utf-8", errors="replace")
        ).hexdigest()

    return {"id": content_id, "title": clean_text(filename)}
def get_attachment_names(msg):
    """Collect descriptors for the non-image attachments of a message.

    A part is considered when it is not a multipart container, carries a
    Content-Disposition header and is not an image. Parts for which
    ``get_attachment_info`` yields nothing (no filename) are dropped.

    Args:
        msg: A parsed message exposing ``walk()``.

    Returns:
        A list of ``{"id": ..., "title": ...}`` dicts in walk order.
    """

    def _is_candidate(part):
        # Same three skip conditions as a chain of `continue` statements,
        # evaluated in the same order.
        return not (
            part.get_content_maintype() == "multipart"
            or part.get("Content-Disposition") is None
            or is_image_attachment(part)
        )

    described = (
        get_attachment_info(part) for part in msg.walk() if _is_candidate(part)
    )
    return [info for info in described if info]
def _part_text(part):
    """Return the decoded text of a body part, tolerating bad charsets."""
    try:
        return part.get_content()
    except (LookupError, UnicodeError):
        # Unknown or bogus charset declared by the sender: fall back to a
        # lossy UTF-8 decode instead of aborting the whole export.
        raw = part.get_payload(decode=True) or b""
        return raw.decode("utf-8", errors="replace")


def _extract_text(msg):
    """Return the plain-text body, else the HTML body as text, else ""."""
    # Compare against None explicitly: message objects define __len__ as
    # their header count, so a header-less part would be falsy.
    plain_part = msg.get_body(preferencelist=("plain",))
    if plain_part is not None:
        return _part_text(plain_part)
    html_part = msg.get_body(preferencelist=("html",))
    if html_part is not None:
        return html2text.html2text(_part_text(html_part))
    # Attachment-only or otherwise body-less message.
    return ""


# Convert every downloaded .eml file into one Label Studio task and write the
# whole list to emails.json.
tasks = []
email_files = glob.glob("emails/**/*.eml", recursive=True)
for path in tqdm(email_files, desc="Processing emails"):
    # The file's basename (without extension) is used as the task id.
    key = Path(path).stem

    # Parse from bytes so that each MIME part is decoded with its own
    # declared charset and transfer encoding. Decoding the whole file as text
    # with a single guessed encoding corrupts 8-bit bodies.
    with open(path, "rb") as f:
        msg = email.message_from_binary_file(f, policy=policy.default)

    text = clean_text(_extract_text(msg))

    # Header values get the same clean-up so the JSON is always writable.
    subject = clean_text(msg["Subject"] or "")
    from_addr = clean_text(msg["From"] or "")
    date = clean_text(msg["Date"] or "")

    attachments = get_attachment_names(msg)

    tasks.append(
        {
            "id": key,
            "data": {
                "subject": subject,
                "from": from_addr,
                "date": date,
                "text": text,
                "attachments": attachments,
            },
        }
    )

# Use a context manager so the output is flushed and closed deterministically.
with open("emails.json", "w", encoding="utf-8") as out:
    json.dump(tasks, out, ensure_ascii=False, indent=2)
{
"nodes": {
"devenv": {
"locked": {
"dir": "src/modules",
"lastModified": 1747185494,
"owner": "cachix",
"repo": "devenv",
"rev": "b292bc94c2daccda165bc9f909bf6c8056e37a80",
"type": "github"
},
"original": {
"dir": "src/modules",
"owner": "cachix",
"repo": "devenv",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1747046372,
"owner": "edolstra",
"repo": "flake-compat",
"rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-compat_2": {
"flake": false,
"locked": {
"lastModified": 1747046372,
"owner": "edolstra",
"repo": "flake-compat",
"rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"git-hooks": {
"inputs": {
"flake-compat": "flake-compat",
"gitignore": "gitignore",
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1746537231,
"owner": "cachix",
"repo": "git-hooks.nix",
"rev": "fa466640195d38ec97cf0493d6d6882bc4d14969",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "git-hooks.nix",
"type": "github"
}
},
"gitignore": {
"inputs": {
"nixpkgs": [
"git-hooks",
"nixpkgs"
]
},
"locked": {
"lastModified": 1709087332,
"owner": "hercules-ci",
"repo": "gitignore.nix",
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "gitignore.nix",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1746807397,
"owner": "cachix",
"repo": "devenv-nixpkgs",
"rev": "c5208b594838ea8e6cca5997fbf784b7cca1ca90",
"type": "github"
},
"original": {
"owner": "cachix",
"ref": "rolling",
"repo": "devenv-nixpkgs",
"type": "github"
}
},
"nixpkgs-python": {
"inputs": {
"flake-compat": "flake-compat_2",
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1746223523,
"owner": "cachix",
"repo": "nixpkgs-python",
"rev": "3f5f1dbe0122a1741907aa5ab76f7337ffcd2ccb",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "nixpkgs-python",
"type": "github"
}
},
"root": {
"inputs": {
"devenv": "devenv",
"git-hooks": "git-hooks",
"nixpkgs": "nixpkgs",
"nixpkgs-python": "nixpkgs-python",
"pre-commit-hooks": [
"git-hooks"
]
}
}
},
"root": "root",
"version": 7
}
# devenv module describing the development environment for this project.
{
pkgs,
lib,
config,
inputs,
...
}:
{
# https://devenv.sh/packages/
packages = [ pkgs.git ];
# Python 3.12 with a virtualenv; uv support and its sync step (all extras)
# are switched on.
languages.python.enable = true;
languages.python.uv.enable = true;
languages.python.uv.sync.enable = true;
languages.python.uv.sync.allExtras = true;
languages.python.venv.enable = true;
languages.python.version = "3.12";
# `dump` script: runs dump-imap.py with no arguments.
# NOTE(review): the README example passes -u and the script's argparse setup
# marks -u as required, so this invocation presumably exits with a usage
# error unless arguments are forwarded - confirm.
scripts.dump.exec = ''
python dump-imap.py
'';
# Git hooks enabled for this repository.
git-hooks.hooks = {
ruff.enable = true;
ruff-format.enable = true;
check-toml.enable = true;
commitizen.enable = true;
nixfmt-rfc-style.enable = true;
# enable when https://github.com/cachix/git-hooks.nix/issues/584 is fixed
# trufflehog.enable = true;
};
# See full reference at https://devenv.sh/reference/options/
}
inputs:
nixpkgs:
url: github:cachix/devenv-nixpkgs/rolling
nixpkgs-python:
url: github:cachix/nixpkgs-python
inputs:
nixpkgs:
follows: nixpkgs
import hashlib
import imaplib
import argparse
import email
import os
import getpass
import socket
from contextlib import contextmanager
from pathlib import Path
from tqdm import tqdm
def get_password():
    """Obtain the IMAP password.

    The ``IMAP_PASSWORD`` environment variable wins when it is set (even to
    an empty string); otherwise the user is prompted via ``getpass``.

    Returns:
        The password string.
    """
    from_env = os.environ.get("IMAP_PASSWORD")
    return getpass.getpass() if from_env is None else from_env
def message_id_hash(msg):
    """Return a stable SHA-256 hex digest identifying *msg*.

    The digest is normally computed over the Message-Id header, trimmed,
    lower-cased and without angle brackets. Messages that have no usable
    Message-Id are hashed over their full serialised content instead.
    Hashing the empty string for all of them would give every such message
    the same name, so all but the first would be skipped as "already
    downloaded".

    Args:
        msg: A parsed ``email.message.Message``.

    Returns:
        A 64-character hexadecimal string.
    """
    # str(): with the compat32 policy a header containing raw 8-bit bytes is
    # returned as an email.header.Header object, which has no .strip().
    mid = str(msg["Message-Id"] or "").strip().lower().strip("<>")
    if mid:
        digest_input = mid.encode("utf-8", errors="replace")
    else:
        digest_input = msg.as_bytes()
    return hashlib.sha256(digest_input).hexdigest()
@contextmanager
def imap_connection(host, username, password):
    """Context manager yielding a logged-in ``imaplib.IMAP4_SSL`` connection.

    Errors raised while connecting, logging in, or inside the ``with`` body
    are reported once on stdout and then re-raised unchanged. On exit the
    selected mailbox is closed and the session logged out on a best-effort
    basis.

    Args:
        host (str): IMAP server hostname.
        username (str): IMAP username.
        password (str): IMAP password.

    Yields:
        imaplib.IMAP4_SSL: The authenticated connection.

    Raises:
        imaplib.IMAP4.error: On authentication or protocol failures.
        socket.timeout: When the server does not answer within 30 seconds.
    """
    mailbox = None
    try:
        # Per-connection timeout. socket.setdefaulttimeout() would silently
        # change the timeout of every socket created in this process.
        mailbox = imaplib.IMAP4_SSL(host, timeout=30)
        mailbox.login(username, password)
        yield mailbox
    # One flat chain of handlers: with a nested try, a re-raised IMAP error
    # was caught again by the outer generic handler and reported twice.
    except imaplib.IMAP4.error as e:
        if "AUTHENTICATIONFAILED" in str(e):
            print(
                "Authentication failed. Please check your credentials and ensure you're using the correct password."
            )
            print(
                "If you're using a VPN, try disconnecting it or using a different VPN server."
            )
        else:
            print(f"IMAP error: {e}")
        raise
    except socket.timeout:
        print(
            "Connection timed out. This might be due to VPN issues or network problems."
        )
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise
    finally:
        # mailbox stays None when the connection itself could not be opened.
        if mailbox is not None:
            # Best-effort cleanup: close() fails when no folder is selected
            # and logout() fails on a dead connection. Neither failure may
            # mask the original exception. `except Exception` (not a bare
            # except) lets KeyboardInterrupt/SystemExit through.
            try:
                mailbox.close()
            except Exception:
                pass
            try:
                mailbox.logout()
            except Exception:
                pass
def parse_args(argv=None):
    """
    Parse command line arguments.

    The password is resolved lazily: only when ``-p`` was not given is
    ``get_password()`` consulted (environment variable, then interactive
    prompt). Using it as the argparse default would evaluate it while the
    parser is being built, prompting even for ``-h``, for a missing ``-u``,
    or when ``-p`` is supplied.

    Args:
        argv (list[str], optional): Arguments to parse instead of
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Parsed command line arguments
    """
    argparser = argparse.ArgumentParser(
        description="Dump a IMAP folder into .eml files"
    )
    argparser.add_argument(
        "-s",
        dest="host",
        help="IMAP host, like imap.gmail.com",
        default="imap.gmail.com",
    )
    argparser.add_argument("-u", dest="username", help="IMAP username", required=True)
    argparser.add_argument("-p", dest="password", help="IMAP password", default=None)
    argparser.add_argument(
        "-r", dest="remote_folder", help="Remote folder to download", default="INBOX"
    )
    argparser.add_argument(
        "-l",
        dest="local_folder",
        help="Local folder where to save .eml files",
        default="./emails",
    )
    argparser.add_argument(
        "-t",
        dest="to_address",
        help="Only download messages sent to this email address",
    )
    args = argparser.parse_args(argv)
    if args.password is None:
        args.password = get_password()
    return args
def download_emails(
    host,
    username,
    password,
    remote_folder="INBOX",
    local_folder="./emails",
    to_address=None,
):
    """
    Download emails from an IMAP server.

    Each message is stored as ``<local_folder>/<username>/<hash>.eml`` where
    the hash comes from ``message_id_hash``. Files that already exist are
    skipped, so the function can be re-run to resume a download.

    Args:
        host (str): IMAP server hostname
        username (str): IMAP username
        password (str): IMAP password
        remote_folder (str): Remote folder to download from
        local_folder (str): Local folder to save emails to
        to_address (str, optional): Only download messages sent to this email address

    Raises:
        imaplib.IMAP4.error: If the folder cannot be selected or the search
            is rejected by the server.
    """
    with imap_connection(host, username, password) as mailbox:
        # select() reports an unknown folder through its status rather than
        # by raising. Unchecked, the failure only surfaces later as a
        # confusing "SEARCH illegal in state AUTH" error.
        status, data = mailbox.select(remote_folder, readonly=True)
        if status != "OK":
            raise imaplib.IMAP4.error(
                f"Could not select folder {remote_folder}: {data}"
            )

        # Use TO filter if to_address is provided, otherwise use ALL
        if to_address:
            # Escape for an IMAP quoted string so a stray quote or backslash
            # cannot break out of the search term.
            quoted = to_address.replace("\\", "\\\\").replace('"', '\\"')
            search_criteria = f'(TO "{quoted}")'
        else:
            search_criteria = "ALL"

        status, data = mailbox.search(None, search_criteria)
        if status != "OK":
            raise imaplib.IMAP4.error(f"Search {search_criteria} failed: {data}")
        email_ids = (data[0] or b"").split()
        tqdm.write(f"{len(email_ids)} emails found")

        local_folder = Path(local_folder) / username
        local_folder.mkdir(parents=True, exist_ok=True)

        for email_id in tqdm(email_ids, desc="Downloading emails"):
            status, data = mailbox.fetch(email_id, "(RFC822)")
            # A message removed between SEARCH and FETCH does not yield the
            # usual (envelope, body) tuple; skip it instead of crashing.
            if status != "OK" or not data or not isinstance(data[0], tuple):
                tqdm.write(
                    f"Skipping message {email_id.decode()}: unexpected FETCH response"
                )
                continue
            raw_email = data[0][1]

            msg = email.message_from_bytes(raw_email)
            path = local_folder / f"{message_id_hash(msg)}.eml"
            if path.exists():
                tqdm.write(f"Skipping {path}")
                continue

            tqdm.write(f"Writing {path}")
            # Write to a temporary name and rename afterwards. An interrupted
            # run must not leave a truncated .eml that later runs would skip
            # as already downloaded.
            partial = path.with_name(path.name + ".part")
            with open(partial, "wb") as f:
                f.write(raw_email)
            os.replace(partial, path)
if __name__ == "__main__":
    # Script entry point: read the CLI options and start the download.
    cli = parse_args()
    download_emails(
        cli.host,
        cli.username,
        cli.password,
        remote_folder=cli.remote_folder,
        local_folder=cli.local_folder,
        to_address=cli.to_address,
    )
[project]
name = "dump-imap"
version = "0.1.0"
description = "Dump a IMAP folder into .eml files"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"chardet>=5.2.0",
"html2text>=2025.4.15",
"tqdm>=4.67.1",
]
version = 1
revision = 2
requires-python = ">=3.12"
[[package]]
name = "chardet"
version = "5.2.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload_time = "2023-08-01T19:23:02.662Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload_time = "2023-08-01T19:23:00.661Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload_time = "2022-10-25T02:36:22.414Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "dump-imap"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "chardet" },
{ name = "html2text" },
{ name = "tqdm" },
]
[package.metadata]
requires-dist = [
{ name = "chardet", specifier = ">=5.2.0" },
{ name = "html2text", specifier = ">=2025.4.15" },
{ name = "tqdm", specifier = ">=4.67.1" },
]
[[package]]
name = "html2text"
version = "2025.4.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f8/27/e158d86ba1e82967cc2f790b0cb02030d4a8bef58e0c79a8590e9678107f/html2text-2025.4.15.tar.gz", hash = "sha256:948a645f8f0bc3abe7fd587019a2197a12436cd73d0d4908af95bfc8da337588", size = 64316, upload_time = "2025-04-15T04:02:30.045Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1d/84/1a0f9555fd5f2b1c924ff932d99b40a0f8a6b12f6dd625e2a47f415b00ea/html2text-2025.4.15-py3-none-any.whl", hash = "sha256:00569167ffdab3d7767a4cdf589b7f57e777a5ed28d12907d8c58769ec734acc", size = 34656, upload_time = "2025-04-15T04:02:28.44Z" },
]
[[package]]
name = "tqdm"
version = "4.67.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload_time = "2024-11-24T20:12:22.481Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload_time = "2024-11-24T20:12:19.698Z" },
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment