Converts .eml files to HTML, Markdown, PDF, plain text, and JSON formats. Handles attachments, counts replies, cleans up HTML, and manages duplicates. Outputs all processed emails into a single indexed HTML file, convertible to other formats.
# =========================================
# Email to PDF Conversion Script
# =========================================
#
# Version: 1.0
# Script written by Warith Al Maawali
# (c) 2024
#
# Discord channel: https://discord.gg/KEFErEx
# Twitter: http://twitter.com/warith2020
# Linkedin: http://www.linkedin.com/in/warith1977
# Website: https://www.digi77.com
#
# This script converts email files (.eml) to PDF format.
# It supports HTML, Markdown, and plain text content,
# preserves formatting, and extracts attachments.
#
# The resulting PDF includes the email body and
# information about any attachments present.
#
# This software is dual-licensed:
#
# Personal, non-commercial use: Apache License 2.0
# Commercial, corporate, or organizational use: Separate commercial license required.
# Contact me for licensing inquiries.
#
# Usage: python eml2pdf.py
# =========================================
# Changeable variables and sensitive words
BASE_DIRECTORY = "/xxxxxxx/Emails"
EML_DIRECTORY = "eml files"
HTML_OUTPUT_DIRECTORY = "html_output"
MARKDOWN_OUTPUT_DIRECTORY = "markdown_output"
PDF_OUTPUT_DIRECTORY = "pdf_output"
COMBINED_OUTPUT_DIRECTORY = "combined_output"
TEXT_OUTPUT_DIRECTORY = "text_output"
JSON_OUTPUT_DIRECTORY = "json_output"
ATTACHMENTS_DIRECTORY = "attachments"
COMBINED_OUTPUT_FILENAME = "combined_emails"
PRIVATE_MESSAGE_PATTERN = r'This is a PRIVATE message.*?purpose\.'
RESTRICTED_DATA_PATTERN = r'External xxxxxxxx correspondence:.*?if it is obtained from another source without restriction\.'
CONFIDENTIALITY_START = "CONFIDENTIALITY: This email and any accompa"
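# Illustrative layout implied by the settings above: BASE_DIRECTORY must already
# contain the EML_DIRECTORY folder with the input .eml files; every other folder
# is created automatically the first time the script runs, e.g.:
#
#   /xxxxxxx/Emails/
#   ├── eml files/          <- input .eml messages (required)
#   ├── attachments/        <- extracted attachments
#   ├── html_output/        <- per-email HTML
#   ├── markdown_output/    <- per-email Markdown
#   ├── pdf_output/         <- per-email PDF
#   ├── text_output/        <- per-email plain text
#   ├── json_output/        <- per-email JSON
#   └── combined_output/    <- combined_emails.{html,md,pdf,txt,json}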
# Import necessary libraries
import os
import email
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, KeepTogether
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from email.utils import getaddresses, parsedate_to_datetime
from reportlab.pdfgen import canvas
from reportlab.platypus.flowables import Flowable
from html import unescape, escape
import re
import pkg_resources
import datetime
import json
import markdown
from tabulate import tabulate
from html5lib import HTMLParser, parse
import subprocess
import sys
from xhtml2pdf import pisa
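# Third-party requirements implied by the imports above (assumed PyPI names):
# beautifulsoup4, reportlab, markdown, tabulate, html5lib, xhtml2pdf.
# convert_to_markdown() additionally shells out to Pandoc, which must be on the PATH.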
"""JSON serializer for objects not serializable by default json code""" | |
def json_serial(obj): | |
if isinstance(obj, datetime.datetime): | |
serial = obj.isoformat() | |
return serial | |
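# Note: json_serial is not called anywhere below; it appears intended as a fallback
# encoder for the json module, e.g. json.dumps(data, default=json_serial).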
""" | |
Extract content from an email file (.eml) | |
Args: | |
eml_file (str): Path to the .eml file | |
Returns: | |
tuple: (date, html_content, subject, reply_count) | |
""" | |
def extract_email_content(eml_file): | |
with open(eml_file, 'rb') as email_file: | |
email_message = BytesParser(policy=policy.default).parse(email_file) | |
# Extract header information | |
sender = email_message['From'] | |
subject = email_message['Subject'] | |
date = email_message['Date'] | |
message_id = email_message['Message-ID'] | |
# Extract attachments | |
attachments = [] | |
for part in email_message.walk(): | |
if part.get_content_maintype() == 'multipart': | |
continue | |
if part.get('Content-Disposition') is None: | |
continue | |
filename = part.get_filename() | |
if filename: | |
size = len(part.get_payload(decode=True)) | |
size_str = f"{size / 1024:.2f} KB" if size < 1024 * 1024 else f"{size / (1024 * 1024):.2f} MB" | |
attachments.append(f"{filename} ({size_str})") | |
# Extract and save attachment | |
attachments_directory = os.path.join(BASE_DIRECTORY, ATTACHMENTS_DIRECTORY) | |
if not os.path.exists(attachments_directory): | |
os.makedirs(attachments_directory) | |
filepath = os.path.join(attachments_directory, filename) | |
with open(filepath, 'wb') as f: | |
f.write(part.get_payload(decode=True)) | |
print(f"Saved attachment: {filepath}") | |
content = [] | |
private_message_pattern = re.compile(PRIVATE_MESSAGE_PATTERN, re.DOTALL) | |
restricted_data_pattern = re.compile(RESTRICTED_DATA_PATTERN, re.DOTALL) | |
# Process multipart emails | |
if email_message.is_multipart(): | |
for part in email_message.walk(): | |
if part.get_content_type() == 'text/plain': | |
part_content = part.get_payload(decode=True).decode() | |
part_content = private_message_pattern.sub('', part_content) | |
part_content = restricted_data_pattern.sub('', part_content) | |
content.append(part_content) | |
elif part.get_content_type() == 'text/html': | |
html_content = part.get_payload(decode=True).decode() | |
soup = BeautifulSoup(html_content, 'html.parser') | |
for img in soup.find_all('img'): | |
img.decompose() | |
for p in soup.find_all('p'): | |
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START): | |
p.decompose() | |
plain_content = soup.get_text() | |
plain_content = private_message_pattern.sub('', plain_content) | |
plain_content = restricted_data_pattern.sub('', plain_content) | |
content.append(plain_content) | |
else: | |
# Process non-multipart emails | |
if email_message.get_content_type() == 'text/plain': | |
email_content = email_message.get_payload(decode=True).decode() | |
email_content = private_message_pattern.sub('', email_content) | |
email_content = restricted_data_pattern.sub('', email_content) | |
content.append(email_content) | |
elif email_message.get_content_type() == 'text/html': | |
html_content = email_message.get_payload(decode=True).decode() | |
soup = BeautifulSoup(html_content, 'html.parser') | |
for img in soup.find_all('img'): | |
img.decompose() | |
for p in soup.find_all('p'): | |
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START): | |
p.decompose() | |
plain_content = soup.get_text() | |
plain_content = private_message_pattern.sub('', plain_content) | |
plain_content = restricted_data_pattern.sub('', plain_content) | |
content.append(plain_content) | |
# Create header in markdown format | |
header = "# Email Details\n\n" | |
header += f"| Field | Value |\n|-------|-------|\n" | |
header += f"| From | {sender} |\n" | |
header += f"| Subject | {subject} |\n" | |
header += f"| Date | {date} |\n" | |
header += f"| Message-ID | {message_id} |\n" | |
if attachments: | |
header += f"| Attachments | {', '.join(attachments)} |\n" | |
full_content = header + '\n\n## Email Body\n\n' + '\n'.join(content) | |
# Convert markdown to HTML with advanced styling | |
html_content = markdown.markdown(full_content, extensions=['tables', 'fenced_code', 'codehilite']) | |
"""Count the number of replies in an email body""" | |
def count_replies(body): | |
wrote_pattern = r"wrote:" | |
matches = re.findall(wrote_pattern, body, re.IGNORECASE) | |
return len(matches) | |
# Count replies | |
reply_count = count_replies('\n'.join(content)) | |
return date, html_content, subject, reply_count | |
""" | |
Save email content as an HTML file with advanced styling | |
Args: | |
email_content (str): HTML content of the email | |
output_file (str): Path to save the HTML file | |
""" | |
def save_email_content(email_content, output_file): | |
# Add some advanced styling | |
styled_html = f""" | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Email Content</title> | |
<style> | |
body {{ | |
font-family: Arial, sans-serif; | |
line-height: 1.6; | |
color: #333; | |
max-width: 800px; | |
margin: 0 auto; | |
padding: 20px; | |
}} | |
h1, h2, h3 {{ | |
color: #2c3e50; | |
}} | |
table {{ | |
border-collapse: collapse; | |
width: 100%; | |
margin-bottom: 20px; | |
}} | |
th, td {{ | |
border: 1px solid #ddd; | |
padding: 12px; | |
text-align: left; | |
}} | |
tr:nth-child(even) {{ | |
background-color: #f2f2f2; | |
}} | |
th {{ | |
background-color: #3498db; | |
color: white; | |
}} | |
pre {{ | |
background-color: #f8f8f8; | |
border: 1px solid #ddd; | |
border-radius: 3px; | |
padding: 10px; | |
overflow-x: auto; | |
}} | |
code {{ | |
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace; | |
font-size: 0.9em; | |
}} | |
blockquote {{ | |
border-left: 4px solid #3498db; | |
padding-left: 15px; | |
color: #777; | |
font-style: italic; | |
}} | |
</style> | |
</head> | |
<body> | |
{email_content} | |
</body> | |
</html> | |
""" | |
# Validate HTML | |
parser = HTMLParser(strict=True) | |
dom = parse(styled_html) | |
with open(output_file, 'w', encoding='utf-8') as f: | |
f.write(styled_html) | |
""" | |
Save all emails to a single HTML file with advanced styling and an index | |
Args: | |
email_data (list): List of tuples containing email data | |
output_file (str): Path to save the combined HTML file | |
""" | |
def save_all_emails_to_one_file(email_data, output_file): | |
# Add some advanced styling | |
combined_html = f""" | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Combined Emails</title> | |
<style> | |
body {{ | |
font-family: Arial, sans-serif; | |
line-height: 1.6; | |
color: #333; | |
width: 100%; | |
margin: 0; | |
padding: 20px; | |
box-sizing: border-box; | |
}} | |
.container {{ | |
max-width: 100%; | |
margin: 0 auto; | |
}} | |
h1, h2, h3 {{ | |
color: #2c3e50; | |
}} | |
table {{ | |
border-collapse: collapse; | |
width: 80%; | |
margin-bottom: 20px; | |
}} | |
th, td {{ | |
border: 1px solid #ddd; | |
padding: 12px; | |
text-align: left; | |
}} | |
tr:nth-child(even) {{ | |
background-color: #f2f2f2; | |
}} | |
th {{ | |
background-color: #3498db; | |
color: white; | |
}} | |
pre {{ | |
background-color: #f8f8f8; | |
border: 1px solid #ddd; | |
border-radius: 3px; | |
padding: 10px; | |
overflow-x: auto; | |
white-space: pre-wrap; | |
word-wrap: break-word; | |
}} | |
code {{ | |
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace; | |
font-size: 0.9em; | |
}} | |
blockquote {{ | |
border-left: 4px solid #3498db; | |
padding-left: 15px; | |
color: #777; | |
font-style: italic; | |
}} | |
a {{ | |
color: #3498db; | |
text-decoration: none; | |
}} | |
a:hover {{ | |
text-decoration: underline; | |
}} | |
</style> | |
</head> | |
<body> | |
<h1>Email Index</h1> | |
<table> | |
<tr><th>#</th><th>Date</th><th>Subject</th><th>EML File</th><th>Replies</th></tr> | |
""" | |
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1): | |
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S') | |
combined_html += f"<tr><td>{idx}</td><td>{formatted_date}</td><td><a href='#email-{idx}'>{subject}</a></td><td>{filename}</td><td>{reply_count}</td></tr>\n" | |
combined_html += "</table><hr>\n" | |
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1): | |
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S') | |
combined_html += f"<h2 id='email-{idx}'>{idx}. Email from {formatted_date}</h2>\n<hr>\n{email_content}\n<br><br>\n" | |
combined_html += """ | |
</body> | |
</html> | |
""" | |
# Validate HTML | |
parser = HTMLParser(strict=True) | |
dom = parse(combined_html) | |
with open(output_file, 'w', encoding='utf-8') as f: | |
f.write(combined_html) | |
""" | |
Convert HTML file to Markdown using Pandoc | |
Args: | |
html_file (str): Path to the input HTML file | |
md_file (str): Path to save the output Markdown file | |
""" | |
def convert_to_markdown(html_file, md_file): | |
try: | |
subprocess.run(['pandoc', '-f', 'html', '-t', 'markdown', '-o', md_file, html_file], check=True) | |
print(f"Successfully converted {html_file} to {md_file}") | |
except subprocess.CalledProcessError as e: | |
print(f"Error converting {html_file} to Markdown: {e}") | |
except FileNotFoundError: | |
print("Error: Pandoc is not installed or not in the system PATH.") | |
print("Please install Pandoc and make sure it's accessible from the command line.") | |
print("You can install Pandoc using the following steps:") | |
print("1. For Ubuntu/Debian: sudo apt-get install pandoc") | |
print("2. For macOS with Homebrew: brew install pandoc") | |
print("3. For Windows: Download the installer from https://pandoc.org/installing.html") | |
print("After installation, restart your terminal or IDE to update the PATH.") | |
sys.exit(1) | |
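# For reference, the subprocess call above is equivalent to invoking Pandoc directly:
#   pandoc -f html -t markdown -o <output>.md <input>.html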
""" | |
Convert HTML file to PDF using pisa (xhtml2pdf) | |
Args: | |
html_file (str): Path to the input HTML file | |
pdf_file (str): Path to save the output PDF file | |
""" | |
def convert_to_pdf(html_file, pdf_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
html_content = f.read() | |
with open(pdf_file, 'wb') as f: | |
pisa.CreatePDF(html_content, dest=f) | |
print(f"Successfully converted {html_file} to PDF: {pdf_file}") | |
""" | |
Convert HTML file to plain text | |
Args: | |
html_file (str): Path to the input HTML file | |
txt_file (str): Path to save the output text file | |
""" | |
def convert_to_text(html_file, txt_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
soup = BeautifulSoup(f, 'html.parser') | |
text_content = soup.get_text() | |
with open(txt_file, 'w', encoding='utf-8') as f: | |
f.write(text_content) | |
print(f"Successfully converted {html_file} to text: {txt_file}") | |
""" | |
Convert HTML file to JSON | |
Args: | |
html_file (str): Path to the input HTML file | |
json_file (str): Path to save the output JSON file | |
""" | |
def convert_to_json(html_file, json_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
soup = BeautifulSoup(f, 'html.parser') | |
data = { | |
'title': soup.title.string if soup.title else '', | |
'body': soup.body.get_text() if soup.body else '' | |
} | |
with open(json_file, 'w', encoding='utf-8') as f: | |
json.dump(data, f, ensure_ascii=False, indent=4) | |
print(f"Successfully converted {html_file} to JSON: {json_file}") | |
if __name__ == "__main__":
    eml_directory = os.path.join(BASE_DIRECTORY, EML_DIRECTORY)
    html_output_directory = os.path.join(BASE_DIRECTORY, HTML_OUTPUT_DIRECTORY)
    markdown_output_directory = os.path.join(BASE_DIRECTORY, MARKDOWN_OUTPUT_DIRECTORY)
    pdf_output_directory = os.path.join(BASE_DIRECTORY, PDF_OUTPUT_DIRECTORY)
    combined_output_directory = os.path.join(BASE_DIRECTORY, COMBINED_OUTPUT_DIRECTORY)
    text_output_directory = os.path.join(BASE_DIRECTORY, TEXT_OUTPUT_DIRECTORY)
    json_output_directory = os.path.join(BASE_DIRECTORY, JSON_OUTPUT_DIRECTORY)

    # Create output directories if they don't exist
    for directory in [html_output_directory, markdown_output_directory, pdf_output_directory,
                      combined_output_directory, text_output_directory, json_output_directory]:
        os.makedirs(directory, exist_ok=True)

    # Paths for the combined (all-emails-in-one) outputs, built from COMBINED_OUTPUT_FILENAME
    combined_output_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".html")
    combined_md_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".md")
    combined_pdf_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".pdf")
    combined_txt_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".txt")
    combined_json_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".json")

    # Extract each .eml file and save its per-email HTML
    email_data = []
    for filename in os.listdir(eml_directory):
        if filename.endswith(".eml"):
            eml_file = os.path.join(eml_directory, filename)
            date, email_content, subject, reply_count = extract_email_content(eml_file)
            email_data.append((date, filename, email_content, subject, reply_count))
            output_file = os.path.join(html_output_directory, os.path.splitext(filename)[0] + ".html")
            save_email_content(email_content, output_file)
            print(f"Saved content of {filename} to {output_file}")

    # Sort emails by date
    email_data.sort(key=lambda x: parsedate_to_datetime(x[0]))
    save_all_emails_to_one_file(email_data, combined_output_file)
    print(f"Saved combined content to {combined_output_file}")

    # Convert HTML files to other formats
    for filename in os.listdir(html_output_directory):
        if filename.endswith(".html"):
            html_file = os.path.join(html_output_directory, filename)
            base_name = os.path.splitext(filename)[0]
            md_file = os.path.join(markdown_output_directory, base_name + ".md")
            convert_to_markdown(html_file, md_file)
            pdf_file = os.path.join(pdf_output_directory, base_name + ".pdf")
            convert_to_pdf(html_file, pdf_file)
            txt_file = os.path.join(text_output_directory, base_name + ".txt")
            convert_to_text(html_file, txt_file)
            json_file = os.path.join(json_output_directory, base_name + ".json")
            convert_to_json(html_file, json_file)

    # Convert combined HTML file to other formats
    convert_to_markdown(combined_output_file, combined_md_file)
    print(f"Converted combined HTML to Markdown: {combined_md_file}")
    convert_to_pdf(combined_output_file, combined_pdf_file)
    print(f"Converted combined HTML to PDF: {combined_pdf_file}")
    convert_to_text(combined_output_file, combined_txt_file)
    print(f"Converted combined HTML to text: {combined_txt_file}")
    convert_to_json(combined_output_file, combined_json_file)
    print(f"Converted combined HTML to JSON: {combined_json_file}")