@WMAL
Last active August 23, 2025 18:05
Converts .eml files to HTML, Markdown, PDF, plain text, and JSON formats. Handles attachments, counts replies, cleans up HTML, and manages duplicates. Outputs all processed emails into a single indexed HTML file, convertible to other formats.
# =========================================
# Email to PDF Conversion Script
# =========================================
#
# Version: 1.1
# Script written by Warith Al Maawali
# (c) 2025
#
# Discord channel: https://discord.gg/KEFErEx
# Twitter: http://twitter.com/warith2020
# Linkedin: http://www.linkedin.com/in/warith1977
# Website: https://www.digi77.com
#
# This script converts email files (.eml) to PDF format.
# It supports HTML, Markdown, and plain text content,
# preserves formatting, and extracts attachments.
#
# The resulting PDF includes the email body and
# information about any attachments present.
#
# This software is dual-licensed:
#
# Personal, non-commercial use: Apache License 2.0
# Commercial, corporate, or organizational use: Separate commercial license required.
# Contact me for licensing inquiries.
#
# Usage: python eml2pdf.py
# =========================================
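# -----------------------------------------
# Dependency note: the package names below are assumed to match the imports
# used further down in this script (they are not pinned by the author).
#   pip install beautifulsoup4 reportlab markdown tabulate html5lib xhtml2pdf chardet
# Pandoc must also be installed separately and be on the PATH, since the
# Markdown conversion shells out to the `pandoc` command.
# -----------------------------------------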
# Changeable variables and sensitive words
BASE_DIRECTORY = "/xxxxxxx/Emails"
EML_DIRECTORY = "eml files"
HTML_OUTPUT_DIRECTORY = "html_output"
MARKDOWN_OUTPUT_DIRECTORY = "markdown_output"
PDF_OUTPUT_DIRECTORY = "pdf_output"
COMBINED_OUTPUT_DIRECTORY = "combined_output"
TEXT_OUTPUT_DIRECTORY = "text_output"
JSON_OUTPUT_DIRECTORY = "json_output"
ATTACHMENTS_DIRECTORY = "attachments"
COMBINED_OUTPUT_FILENAME = "combined_emails"
PRIVATE_MESSAGE_PATTERN = r'This is a PRIVATE message.*?purpose\.'
RESTRICTED_DATA_PATTERN = r'External xxxxxxxx correspondence:.*?if it is obtained from another source without restriction\.'
CONFIDENTIALITY_START = "CONFIDENTIALITY: This email and any accompa"
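# Expected directory layout (derived from the variables above and the paths
# built in __main__; adjust BASE_DIRECTORY to your own environment):
#   <BASE_DIRECTORY>/
#       eml files/          input .eml messages (EML_DIRECTORY)
#       attachments/        extracted attachments (created on demand)
#       html_output/        per-email HTML files (created automatically)
#       markdown_output/, pdf_output/, text_output/, json_output/
#       combined_output/    combined_emails.html / .md / .pdf / .txt / .json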
# Import necessary libraries
import os
import email
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, KeepTogether
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from email.utils import getaddresses, parsedate_to_datetime
from reportlab.pdfgen import canvas
from reportlab.platypus.flowables import Flowable
from html import unescape, escape
import re
import pkg_resources
import datetime
import json
import markdown
from tabulate import tabulate
from html5lib import HTMLParser, parse
import subprocess
import sys
from xhtml2pdf import pisa
import logging
import chardet
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
"""JSON serializer for objects not serializable by default json code"""
def json_serial(obj):
if isinstance(obj, datetime.datetime):
serial = obj.isoformat()
return serial
"""
Safe payload decoder that handles various encodings and errors gracefully
Args:
part: Email message part to decode
Returns:
str: Decoded text content or empty string if decoding fails
"""
def decode_payload(part):
try:
# Get the payload
payload = part.get_payload(decode=True)
# If payload is None, return empty string
if payload is None:
logger.warning("Empty payload encountered")
return ""
# Try to get charset from the part
charset = part.get_content_charset()
# If charset is specified, try to use it first
if charset:
try:
return payload.decode(charset)
except (UnicodeDecodeError, LookupError) as e:
logger.warning(f"Failed to decode with specified charset {charset}: {e}")
# Try common encodings in order of likelihood
encodings = ['utf-8', 'latin-1', 'windows-1252', 'iso-8859-1', 'ascii']
for encoding in encodings:
try:
return payload.decode(encoding)
except (UnicodeDecodeError, LookupError):
continue
# If all encodings fail, try to detect encoding using chardet
try:
detected = chardet.detect(payload)
if detected and detected.get('encoding'):
try:
return payload.decode(detected['encoding'])
except (UnicodeDecodeError, LookupError) as e:
logger.warning(f"Failed to decode with detected encoding {detected['encoding']}: {e}")
except Exception as e:
logger.warning(f"Charset detection failed: {e}")
# Last resort: decode with errors='ignore'
logger.warning("All decoding attempts failed, using UTF-8 with errors='ignore'")
return payload.decode('utf-8', errors='ignore')
except Exception as e:
logger.error(f"Unexpected error in decode_payload: {e}")
return ""
"""
Extract content from an email file (.eml)
Args:
eml_file (str): Path to the .eml file
Returns:
tuple: (date, html_content, subject, reply_count)
"""
def extract_email_content(eml_file):
try:
with open(eml_file, 'rb') as email_file:
email_message = BytesParser(policy=policy.default).parse(email_file)
except Exception as e:
logger.error(f"Failed to parse email file {eml_file}: {e}")
raise
# Extract header information
sender = email_message['From']
subject = email_message['Subject']
date = email_message['Date']
message_id = email_message['Message-ID']
# Extract attachments
attachments = []
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
if filename:
try:
payload = part.get_payload(decode=True)
if payload:
size = len(payload)
size_str = f"{size / 1024:.2f} KB" if size < 1024 * 1024 else f"{size / (1024 * 1024):.2f} MB"
attachments.append(f"{filename} ({size_str})")
# Extract and save attachment
attachments_directory = os.path.join(BASE_DIRECTORY, ATTACHMENTS_DIRECTORY)
if not os.path.exists(attachments_directory):
os.makedirs(attachments_directory)
filepath = os.path.join(attachments_directory, filename)
with open(filepath, 'wb') as f:
f.write(payload)
logger.info(f"Saved attachment: {filepath}")
except Exception as e:
logger.warning(f"Failed to extract attachment {filename}: {e}")
attachments.append(f"{filename} (extraction failed)")
content = []
private_message_pattern = re.compile(PRIVATE_MESSAGE_PATTERN, re.DOTALL)
restricted_data_pattern = re.compile(RESTRICTED_DATA_PATTERN, re.DOTALL)
# Process multipart emails
if email_message.is_multipart():
for part in email_message.walk():
try:
content_type = part.get_content_type()
if content_type == 'text/plain':
part_content = decode_payload(part)
if part_content:
part_content = private_message_pattern.sub('', part_content)
part_content = restricted_data_pattern.sub('', part_content)
content.append(part_content)
elif content_type == 'text/html':
html_content = decode_payload(part)
if html_content:
soup = BeautifulSoup(html_content, 'html.parser')
for img in soup.find_all('img'):
img.decompose()
for p in soup.find_all('p'):
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START):
p.decompose()
plain_content = soup.get_text()
plain_content = private_message_pattern.sub('', plain_content)
plain_content = restricted_data_pattern.sub('', plain_content)
content.append(plain_content)
except Exception as e:
logger.warning(f"Error processing multipart email part: {e}")
continue
else:
# Process non-multipart emails
try:
content_type = email_message.get_content_type()
if content_type == 'text/plain':
email_content = decode_payload(email_message)
if email_content:
email_content = private_message_pattern.sub('', email_content)
email_content = restricted_data_pattern.sub('', email_content)
content.append(email_content)
elif content_type == 'text/html':
html_content = decode_payload(email_message)
if html_content:
soup = BeautifulSoup(html_content, 'html.parser')
for img in soup.find_all('img'):
img.decompose()
for p in soup.find_all('p'):
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START):
p.decompose()
plain_content = soup.get_text()
plain_content = private_message_pattern.sub('', plain_content)
plain_content = restricted_data_pattern.sub('', plain_content)
content.append(plain_content)
except Exception as e:
logger.warning(f"Error processing non-multipart email: {e}")
# Create header in markdown format
header = "# Email Details\n\n"
header += f"| Field | Value |\n|-------|-------|\n"
header += f"| From | {sender} |\n"
header += f"| Subject | {subject} |\n"
header += f"| Date | {date} |\n"
header += f"| Message-ID | {message_id} |\n"
if attachments:
header += f"| Attachments | {', '.join(attachments)} |\n"
full_content = header + '\n\n## Email Body\n\n' + '\n'.join(content)
# Convert markdown to HTML with advanced styling
html_content = markdown.markdown(full_content, extensions=['tables', 'fenced_code', 'codehilite'])
"""Count the number of replies in an email body"""
def count_replies(body):
wrote_pattern = r"wrote:"
matches = re.findall(wrote_pattern, body, re.IGNORECASE)
return len(matches)
# Count replies
reply_count = count_replies('\n'.join(content))
return date, html_content, subject, reply_count
"""
Save email content as an HTML file with advanced styling
Args:
email_content (str): HTML content of the email
output_file (str): Path to save the HTML file
"""
def save_email_content(email_content, output_file):
# Add some advanced styling
styled_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Email Content</title>
<style>
body {{
font-family: Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}}
h1, h2, h3 {{
color: #2c3e50;
}}
table {{
border-collapse: collapse;
width: 100%;
margin-bottom: 20px;
}}
th, td {{
border: 1px solid #ddd;
padding: 12px;
text-align: left;
}}
tr:nth-child(even) {{
background-color: #f2f2f2;
}}
th {{
background-color: #3498db;
color: white;
}}
pre {{
background-color: #f8f8f8;
border: 1px solid #ddd;
border-radius: 3px;
padding: 10px;
overflow-x: auto;
}}
code {{
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace;
font-size: 0.9em;
}}
blockquote {{
border-left: 4px solid #3498db;
padding-left: 15px;
color: #777;
font-style: italic;
}}
</style>
</head>
<body>
{email_content}
</body>
</html>
"""
# Validate HTML
parser = HTMLParser(strict=True)
dom = parse(styled_html)
with open(output_file, 'w', encoding='utf-8') as f:
f.write(styled_html)
"""
Save all emails to a single HTML file with advanced styling and an index
Args:
email_data (list): List of tuples containing email data
output_file (str): Path to save the combined HTML file
"""
def save_all_emails_to_one_file(email_data, output_file):
# Add some advanced styling
combined_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Combined Emails</title>
<style>
body {{
font-family: Arial, sans-serif;
line-height: 1.6;
color: #333;
width: 100%;
margin: 0;
padding: 20px;
box-sizing: border-box;
}}
.container {{
max-width: 100%;
margin: 0 auto;
}}
h1, h2, h3 {{
color: #2c3e50;
}}
table {{
border-collapse: collapse;
width: 80%;
margin-bottom: 20px;
}}
th, td {{
border: 1px solid #ddd;
padding: 12px;
text-align: left;
}}
tr:nth-child(even) {{
background-color: #f2f2f2;
}}
th {{
background-color: #3498db;
color: white;
}}
pre {{
background-color: #f8f8f8;
border: 1px solid #ddd;
border-radius: 3px;
padding: 10px;
overflow-x: auto;
white-space: pre-wrap;
word-wrap: break-word;
}}
code {{
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace;
font-size: 0.9em;
}}
blockquote {{
border-left: 4px solid #3498db;
padding-left: 15px;
color: #777;
font-style: italic;
}}
a {{
color: #3498db;
text-decoration: none;
}}
a:hover {{
text-decoration: underline;
}}
</style>
</head>
<body>
<h1>Email Index</h1>
<table>
<tr><th>#</th><th>Date</th><th>Subject</th><th>EML File</th><th>Replies</th></tr>
"""
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1):
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S')
combined_html += f"<tr><td>{idx}</td><td>{formatted_date}</td><td><a href='#email-{idx}'>{subject}</a></td><td>{filename}</td><td>{reply_count}</td></tr>\n"
combined_html += "</table><hr>\n"
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1):
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S')
combined_html += f"<h2 id='email-{idx}'>{idx}. Email from {formatted_date}</h2>\n<hr>\n{email_content}\n<br><br>\n"
combined_html += """
</body>
</html>
"""
# Validate HTML
parser = HTMLParser(strict=True)
dom = parse(combined_html)
with open(output_file, 'w', encoding='utf-8') as f:
f.write(combined_html)
"""
Convert HTML file to Markdown using Pandoc
Args:
html_file (str): Path to the input HTML file
md_file (str): Path to save the output Markdown file
"""
def convert_to_markdown(html_file, md_file):
try:
subprocess.run(['pandoc', '-f', 'html', '-t', 'markdown', '-o', md_file, html_file], check=True)
print(f"Successfully converted {html_file} to {md_file}")
except subprocess.CalledProcessError as e:
print(f"Error converting {html_file} to Markdown: {e}")
except FileNotFoundError:
print("Error: Pandoc is not installed or not in the system PATH.")
print("Please install Pandoc and make sure it's accessible from the command line.")
print("You can install Pandoc using the following steps:")
print("1. For Ubuntu/Debian: sudo apt-get install pandoc")
print("2. For macOS with Homebrew: brew install pandoc")
print("3. For Windows: Download the installer from https://pandoc.org/installing.html")
print("After installation, restart your terminal or IDE to update the PATH.")
sys.exit(1)
"""
Convert HTML file to PDF using pisa (xhtml2pdf)
Args:
html_file (str): Path to the input HTML file
pdf_file (str): Path to save the output PDF file
"""
def convert_to_pdf(html_file, pdf_file):
with open(html_file, 'r', encoding='utf-8') as f:
html_content = f.read()
with open(pdf_file, 'wb') as f:
pisa.CreatePDF(html_content, dest=f)
print(f"Successfully converted {html_file} to PDF: {pdf_file}")
"""
Convert HTML file to plain text
Args:
html_file (str): Path to the input HTML file
txt_file (str): Path to save the output text file
"""
def convert_to_text(html_file, txt_file):
with open(html_file, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
text_content = soup.get_text()
with open(txt_file, 'w', encoding='utf-8') as f:
f.write(text_content)
print(f"Successfully converted {html_file} to text: {txt_file}")
"""
Convert HTML file to JSON
Args:
html_file (str): Path to the input HTML file
json_file (str): Path to save the output JSON file
"""
def convert_to_json(html_file, json_file):
with open(html_file, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
data = {
'title': soup.title.string if soup.title else '',
'body': soup.body.get_text() if soup.body else ''
}
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
print(f"Successfully converted {html_file} to JSON: {json_file}")
if __name__ == "__main__":
    eml_directory = os.path.join(BASE_DIRECTORY, EML_DIRECTORY)
    html_output_directory = os.path.join(BASE_DIRECTORY, HTML_OUTPUT_DIRECTORY)
    markdown_output_directory = os.path.join(BASE_DIRECTORY, MARKDOWN_OUTPUT_DIRECTORY)
    pdf_output_directory = os.path.join(BASE_DIRECTORY, PDF_OUTPUT_DIRECTORY)
    combined_output_directory = os.path.join(BASE_DIRECTORY, COMBINED_OUTPUT_DIRECTORY)
    text_output_directory = os.path.join(BASE_DIRECTORY, TEXT_OUTPUT_DIRECTORY)
    json_output_directory = os.path.join(BASE_DIRECTORY, JSON_OUTPUT_DIRECTORY)

    # Create output directories if they don't exist
    for directory in [html_output_directory, markdown_output_directory, pdf_output_directory,
                      combined_output_directory, text_output_directory, json_output_directory]:
        os.makedirs(directory, exist_ok=True)

    combined_output_file = os.path.join(combined_output_directory, "combined_emails.html")
    combined_md_file = os.path.join(combined_output_directory, "combined_emails.md")
    combined_pdf_file = os.path.join(combined_output_directory, "combined_emails.pdf")
    combined_txt_file = os.path.join(combined_output_directory, "combined_emails.txt")
    combined_json_file = os.path.join(combined_output_directory, "combined_emails.json")

    email_data = []
    failed_emails = []
    for filename in os.listdir(eml_directory):
        if filename.endswith(".eml"):
            eml_file = os.path.join(eml_directory, filename)
            try:
                logger.info(f"Processing {filename}")
                date, email_content, subject, reply_count = extract_email_content(eml_file)
                # Only add to email_data if we got valid content
                if email_content and date and subject is not None:
                    email_data.append((date, filename, email_content, subject, reply_count))
                    output_file = os.path.join(html_output_directory, os.path.splitext(filename)[0] + ".html")
                    save_email_content(email_content, output_file)
                    print(f"Saved content of {filename} to {output_file}")
                else:
                    logger.warning(f"Skipping {filename} due to missing content")
                    failed_emails.append(filename)
            except Exception as e:
                logger.error(f"Failed to process {filename}: {e}")
                failed_emails.append(filename)
                continue

    # Report any failed emails
    if failed_emails:
        print(f"\nWarning: Failed to process {len(failed_emails)} email(s):")
        for failed in failed_emails:
            print(f" - {failed}")
        print()

    # Sort emails by date
    email_data.sort(key=lambda x: email.utils.parsedate_to_datetime(x[0]))
    save_all_emails_to_one_file(email_data, combined_output_file)
    print(f"Saved combined content to {combined_output_file}")

    # Convert HTML files to other formats
    for filename in os.listdir(html_output_directory):
        if filename.endswith(".html"):
            html_file = os.path.join(html_output_directory, filename)
            base_name = os.path.splitext(filename)[0]
            md_file = os.path.join(markdown_output_directory, base_name + ".md")
            convert_to_markdown(html_file, md_file)
            pdf_file = os.path.join(pdf_output_directory, base_name + ".pdf")
            convert_to_pdf(html_file, pdf_file)
            txt_file = os.path.join(text_output_directory, base_name + ".txt")
            convert_to_text(html_file, txt_file)
            json_file = os.path.join(json_output_directory, base_name + ".json")
            convert_to_json(html_file, json_file)

    # Convert combined HTML file to other formats
    convert_to_markdown(combined_output_file, combined_md_file)
    print(f"Converted combined HTML to Markdown: {combined_md_file}")
    convert_to_pdf(combined_output_file, combined_pdf_file)
    print(f"Converted combined HTML to PDF: {combined_pdf_file}")
    convert_to_text(combined_output_file, combined_txt_file)
    print(f"Converted combined HTML to text: {combined_txt_file}")
    convert_to_json(combined_output_file, combined_json_file)
    print(f"Converted combined HTML to JSON: {combined_json_file}")
@jokroese

Hey, this is great! I was wondering what license it's under? I'd love to adapt it for converting my emails.

@WMAL (Author) commented Oct 27, 2024

Hi,

This software is dual-licensed:

  • Personal, non-commercial use: Apache License 2.0
  • Commercial, corporate, or organizational use: Separate commercial license required.
    Contact [Your Contact Information] for licensing inquiries.

So feel free to use it for personal use.

@jokroese

Brilliant! Thanks for your quick response :)
