Converts .eml files to HTML, Markdown, PDF, plain text, and JSON formats. Handles attachments, counts replies, cleans up HTML, and manages duplicates. Outputs all processed emails into a single indexed HTML file, convertible to other formats.
# =========================================
# Email to PDF Conversion Script
# =========================================
#
# Version: 1.0
# Script written by Warith Al Maawali
# (c) 2024
#
# Discord channel: https://discord.gg/KEFErEx
# Twitter: http://twitter.com/warith2020
# Linkedin: http://www.linkedin.com/in/warith1977
# Website: https://www.digi77.com
#
# This script converts email files (.eml) to PDF format.
# It supports HTML, Markdown, and plain text content,
# preserves formatting, and extracts attachments.
#
# The resulting PDF includes the email body and
# information about any attachments present.
#
# This software is dual-licensed:
#
# Personal, non-commercial use: Apache License 2.0
# Commercial, corporate, or organizational use: Separate commercial license required.
# Contact me for licensing inquiries.
#
# Usage: python eml2pdf.py
# =========================================
# Changeable variables and sensitive words
BASE_DIRECTORY = "/xxxxxxx/Emails"
EML_DIRECTORY = "eml files"
HTML_OUTPUT_DIRECTORY = "html_output"
MARKDOWN_OUTPUT_DIRECTORY = "markdown_output"
PDF_OUTPUT_DIRECTORY = "pdf_output"
COMBINED_OUTPUT_DIRECTORY = "combined_output"
TEXT_OUTPUT_DIRECTORY = "text_output"
JSON_OUTPUT_DIRECTORY = "json_output"
ATTACHMENTS_DIRECTORY = "attachments"
COMBINED_OUTPUT_FILENAME = "combined_emails"
PRIVATE_MESSAGE_PATTERN = r'This is a PRIVATE message.*?purpose\.'
RESTRICTED_DATA_PATTERN = r'External xxxxxxxx correspondence:.*?if it is obtained from another source without restriction\.'
CONFIDENTIALITY_START = "CONFIDENTIALITY: This email and any accompa"
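# Illustrative layout implied by the settings above: BASE_DIRECTORY must already
# contain the EML_DIRECTORY folder with the input .eml files; every other folder
# is created automatically the first time the script runs, e.g.:
#
#   /xxxxxxx/Emails/
#   ├── eml files/          <- input .eml messages (required)
#   ├── attachments/        <- extracted attachments
#   ├── html_output/        <- per-email HTML
#   ├── markdown_output/    <- per-email Markdown
#   ├── pdf_output/         <- per-email PDF
#   ├── text_output/        <- per-email plain text
#   ├── json_output/        <- per-email JSON
#   └── combined_output/    <- combined_emails.{html,md,pdf,txt,json}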
# Import necessary libraries
import os
import email
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, KeepTogether
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from email.utils import getaddresses, parsedate_to_datetime
from reportlab.pdfgen import canvas
from reportlab.platypus.flowables import Flowable
from html import unescape, escape
import re
import pkg_resources
import datetime
import json
import markdown
from tabulate import tabulate
from html5lib import HTMLParser, parse
import subprocess
import sys
from xhtml2pdf import pisa
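# Third-party requirements implied by the imports above (assumed PyPI names):
# beautifulsoup4, reportlab, markdown, tabulate, html5lib, xhtml2pdf.
# convert_to_markdown() additionally shells out to Pandoc, which must be on the PATH.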
"""JSON serializer for objects not serializable by default json code""" | |
def json_serial(obj): | |
if isinstance(obj, datetime.datetime): | |
serial = obj.isoformat() | |
return serial | |
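# Note: json_serial is not called anywhere below; it appears intended as a fallback
# encoder for the json module, e.g. json.dumps(data, default=json_serial).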
""" | |
Extract content from an email file (.eml) | |
Args: | |
eml_file (str): Path to the .eml file | |
Returns: | |
tuple: (date, html_content, subject, reply_count) | |
""" | |
def extract_email_content(eml_file): | |
with open(eml_file, 'rb') as email_file: | |
email_message = BytesParser(policy=policy.default).parse(email_file) | |
# Extract header information | |
sender = email_message['From'] | |
subject = email_message['Subject'] | |
date = email_message['Date'] | |
message_id = email_message['Message-ID'] | |
# Extract attachments | |
attachments = [] | |
for part in email_message.walk(): | |
if part.get_content_maintype() == 'multipart': | |
continue | |
if part.get('Content-Disposition') is None: | |
continue | |
filename = part.get_filename() | |
if filename: | |
size = len(part.get_payload(decode=True)) | |
size_str = f"{size / 1024:.2f} KB" if size < 1024 * 1024 else f"{size / (1024 * 1024):.2f} MB" | |
attachments.append(f"{filename} ({size_str})") | |
# Extract and save attachment | |
attachments_directory = os.path.join(BASE_DIRECTORY, ATTACHMENTS_DIRECTORY) | |
if not os.path.exists(attachments_directory): | |
os.makedirs(attachments_directory) | |
filepath = os.path.join(attachments_directory, filename) | |
with open(filepath, 'wb') as f: | |
f.write(part.get_payload(decode=True)) | |
print(f"Saved attachment: {filepath}") | |
content = [] | |
private_message_pattern = re.compile(PRIVATE_MESSAGE_PATTERN, re.DOTALL) | |
restricted_data_pattern = re.compile(RESTRICTED_DATA_PATTERN, re.DOTALL) | |
# Process multipart emails | |
if email_message.is_multipart(): | |
for part in email_message.walk(): | |
if part.get_content_type() == 'text/plain': | |
part_content = part.get_payload(decode=True).decode() | |
part_content = private_message_pattern.sub('', part_content) | |
part_content = restricted_data_pattern.sub('', part_content) | |
content.append(part_content) | |
elif part.get_content_type() == 'text/html': | |
html_content = part.get_payload(decode=True).decode() | |
soup = BeautifulSoup(html_content, 'html.parser') | |
for img in soup.find_all('img'): | |
img.decompose() | |
for p in soup.find_all('p'): | |
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START): | |
p.decompose() | |
plain_content = soup.get_text() | |
plain_content = private_message_pattern.sub('', plain_content) | |
plain_content = restricted_data_pattern.sub('', plain_content) | |
content.append(plain_content) | |
else: | |
# Process non-multipart emails | |
if email_message.get_content_type() == 'text/plain': | |
email_content = email_message.get_payload(decode=True).decode() | |
email_content = private_message_pattern.sub('', email_content) | |
email_content = restricted_data_pattern.sub('', email_content) | |
content.append(email_content) | |
elif email_message.get_content_type() == 'text/html': | |
html_content = email_message.get_payload(decode=True).decode() | |
soup = BeautifulSoup(html_content, 'html.parser') | |
for img in soup.find_all('img'): | |
img.decompose() | |
for p in soup.find_all('p'): | |
if p.text.startswith("National Security Services Group ") or p.text.startswith(CONFIDENTIALITY_START): | |
p.decompose() | |
plain_content = soup.get_text() | |
plain_content = private_message_pattern.sub('', plain_content) | |
plain_content = restricted_data_pattern.sub('', plain_content) | |
content.append(plain_content) | |
# Create header in markdown format | |
header = "# Email Details\n\n" | |
header += f"| Field | Value |\n|-------|-------|\n" | |
header += f"| From | {sender} |\n" | |
header += f"| Subject | {subject} |\n" | |
header += f"| Date | {date} |\n" | |
header += f"| Message-ID | {message_id} |\n" | |
if attachments: | |
header += f"| Attachments | {', '.join(attachments)} |\n" | |
full_content = header + '\n\n## Email Body\n\n' + '\n'.join(content) | |
# Convert markdown to HTML with advanced styling | |
html_content = markdown.markdown(full_content, extensions=['tables', 'fenced_code', 'codehilite']) | |
"""Count the number of replies in an email body""" | |
def count_replies(body): | |
wrote_pattern = r"wrote:" | |
matches = re.findall(wrote_pattern, body, re.IGNORECASE) | |
return len(matches) | |
# Count replies | |
reply_count = count_replies('\n'.join(content)) | |
return date, html_content, subject, reply_count | |
""" | |
Save email content as an HTML file with advanced styling | |
Args: | |
email_content (str): HTML content of the email | |
output_file (str): Path to save the HTML file | |
""" | |
def save_email_content(email_content, output_file): | |
# Add some advanced styling | |
styled_html = f""" | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Email Content</title> | |
<style> | |
body {{ | |
font-family: Arial, sans-serif; | |
line-height: 1.6; | |
color: #333; | |
max-width: 800px; | |
margin: 0 auto; | |
padding: 20px; | |
}} | |
h1, h2, h3 {{ | |
color: #2c3e50; | |
}} | |
table {{ | |
border-collapse: collapse; | |
width: 100%; | |
margin-bottom: 20px; | |
}} | |
th, td {{ | |
border: 1px solid #ddd; | |
padding: 12px; | |
text-align: left; | |
}} | |
tr:nth-child(even) {{ | |
background-color: #f2f2f2; | |
}} | |
th {{ | |
background-color: #3498db; | |
color: white; | |
}} | |
pre {{ | |
background-color: #f8f8f8; | |
border: 1px solid #ddd; | |
border-radius: 3px; | |
padding: 10px; | |
overflow-x: auto; | |
}} | |
code {{ | |
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace; | |
font-size: 0.9em; | |
}} | |
blockquote {{ | |
border-left: 4px solid #3498db; | |
padding-left: 15px; | |
color: #777; | |
font-style: italic; | |
}} | |
</style> | |
</head> | |
<body> | |
{email_content} | |
</body> | |
</html> | |
""" | |
# Validate HTML | |
parser = HTMLParser(strict=True) | |
dom = parse(styled_html) | |
with open(output_file, 'w', encoding='utf-8') as f: | |
f.write(styled_html) | |
""" | |
Save all emails to a single HTML file with advanced styling and an index | |
Args: | |
email_data (list): List of tuples containing email data | |
output_file (str): Path to save the combined HTML file | |
""" | |
def save_all_emails_to_one_file(email_data, output_file): | |
# Add some advanced styling | |
combined_html = f""" | |
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<title>Combined Emails</title> | |
<style> | |
body {{ | |
font-family: Arial, sans-serif; | |
line-height: 1.6; | |
color: #333; | |
width: 100%; | |
margin: 0; | |
padding: 20px; | |
box-sizing: border-box; | |
}} | |
.container {{ | |
max-width: 100%; | |
margin: 0 auto; | |
}} | |
h1, h2, h3 {{ | |
color: #2c3e50; | |
}} | |
table {{ | |
border-collapse: collapse; | |
width: 80%; | |
margin-bottom: 20px; | |
}} | |
th, td {{ | |
border: 1px solid #ddd; | |
padding: 12px; | |
text-align: left; | |
}} | |
tr:nth-child(even) {{ | |
background-color: #f2f2f2; | |
}} | |
th {{ | |
background-color: #3498db; | |
color: white; | |
}} | |
pre {{ | |
background-color: #f8f8f8; | |
border: 1px solid #ddd; | |
border-radius: 3px; | |
padding: 10px; | |
overflow-x: auto; | |
white-space: pre-wrap; | |
word-wrap: break-word; | |
}} | |
code {{ | |
font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace; | |
font-size: 0.9em; | |
}} | |
blockquote {{ | |
border-left: 4px solid #3498db; | |
padding-left: 15px; | |
color: #777; | |
font-style: italic; | |
}} | |
a {{ | |
color: #3498db; | |
text-decoration: none; | |
}} | |
a:hover {{ | |
text-decoration: underline; | |
}} | |
</style> | |
</head> | |
<body> | |
<h1>Email Index</h1> | |
<table> | |
<tr><th>#</th><th>Date</th><th>Subject</th><th>EML File</th><th>Replies</th></tr> | |
""" | |
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1): | |
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S') | |
combined_html += f"<tr><td>{idx}</td><td>{formatted_date}</td><td><a href='#email-{idx}'>{subject}</a></td><td>{filename}</td><td>{reply_count}</td></tr>\n" | |
combined_html += "</table><hr>\n" | |
for idx, (date, filename, email_content, subject, reply_count) in enumerate(email_data, 1): | |
formatted_date = parsedate_to_datetime(date).strftime('%Y-%m-%d %H:%M:%S') | |
combined_html += f"<h2 id='email-{idx}'>{idx}. Email from {formatted_date}</h2>\n<hr>\n{email_content}\n<br><br>\n" | |
combined_html += """ | |
</body> | |
</html> | |
""" | |
# Validate HTML | |
parser = HTMLParser(strict=True) | |
dom = parse(combined_html) | |
with open(output_file, 'w', encoding='utf-8') as f: | |
f.write(combined_html) | |
""" | |
Convert HTML file to Markdown using Pandoc | |
Args: | |
html_file (str): Path to the input HTML file | |
md_file (str): Path to save the output Markdown file | |
""" | |
def convert_to_markdown(html_file, md_file): | |
try: | |
subprocess.run(['pandoc', '-f', 'html', '-t', 'markdown', '-o', md_file, html_file], check=True) | |
print(f"Successfully converted {html_file} to {md_file}") | |
except subprocess.CalledProcessError as e: | |
print(f"Error converting {html_file} to Markdown: {e}") | |
except FileNotFoundError: | |
print("Error: Pandoc is not installed or not in the system PATH.") | |
print("Please install Pandoc and make sure it's accessible from the command line.") | |
print("You can install Pandoc using the following steps:") | |
print("1. For Ubuntu/Debian: sudo apt-get install pandoc") | |
print("2. For macOS with Homebrew: brew install pandoc") | |
print("3. For Windows: Download the installer from https://pandoc.org/installing.html") | |
print("After installation, restart your terminal or IDE to update the PATH.") | |
sys.exit(1) | |
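# For reference, the subprocess call above is equivalent to invoking Pandoc directly:
#   pandoc -f html -t markdown -o <output>.md <input>.html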
""" | |
Convert HTML file to PDF using pisa (xhtml2pdf) | |
Args: | |
html_file (str): Path to the input HTML file | |
pdf_file (str): Path to save the output PDF file | |
""" | |
def convert_to_pdf(html_file, pdf_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
html_content = f.read() | |
with open(pdf_file, 'wb') as f: | |
pisa.CreatePDF(html_content, dest=f) | |
print(f"Successfully converted {html_file} to PDF: {pdf_file}") | |
""" | |
Convert HTML file to plain text | |
Args: | |
html_file (str): Path to the input HTML file | |
txt_file (str): Path to save the output text file | |
""" | |
def convert_to_text(html_file, txt_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
soup = BeautifulSoup(f, 'html.parser') | |
text_content = soup.get_text() | |
with open(txt_file, 'w', encoding='utf-8') as f: | |
f.write(text_content) | |
print(f"Successfully converted {html_file} to text: {txt_file}") | |
""" | |
Convert HTML file to JSON | |
Args: | |
html_file (str): Path to the input HTML file | |
json_file (str): Path to save the output JSON file | |
""" | |
def convert_to_json(html_file, json_file): | |
with open(html_file, 'r', encoding='utf-8') as f: | |
soup = BeautifulSoup(f, 'html.parser') | |
data = { | |
'title': soup.title.string if soup.title else '', | |
'body': soup.body.get_text() if soup.body else '' | |
} | |
with open(json_file, 'w', encoding='utf-8') as f: | |
json.dump(data, f, ensure_ascii=False, indent=4) | |
print(f"Successfully converted {html_file} to JSON: {json_file}") | |
if __name__ == "__main__":
    eml_directory = os.path.join(BASE_DIRECTORY, EML_DIRECTORY)
    html_output_directory = os.path.join(BASE_DIRECTORY, HTML_OUTPUT_DIRECTORY)
    markdown_output_directory = os.path.join(BASE_DIRECTORY, MARKDOWN_OUTPUT_DIRECTORY)
    pdf_output_directory = os.path.join(BASE_DIRECTORY, PDF_OUTPUT_DIRECTORY)
    combined_output_directory = os.path.join(BASE_DIRECTORY, COMBINED_OUTPUT_DIRECTORY)
    text_output_directory = os.path.join(BASE_DIRECTORY, TEXT_OUTPUT_DIRECTORY)
    json_output_directory = os.path.join(BASE_DIRECTORY, JSON_OUTPUT_DIRECTORY)

    # Create output directories if they don't exist
    for directory in [html_output_directory, markdown_output_directory, pdf_output_directory,
                      combined_output_directory, text_output_directory, json_output_directory]:
        os.makedirs(directory, exist_ok=True)

    # Paths for the combined (all-emails-in-one) outputs, built from COMBINED_OUTPUT_FILENAME
    combined_output_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".html")
    combined_md_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".md")
    combined_pdf_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".pdf")
    combined_txt_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".txt")
    combined_json_file = os.path.join(combined_output_directory, COMBINED_OUTPUT_FILENAME + ".json")

    # Extract each .eml file and save its per-email HTML
    email_data = []
    for filename in os.listdir(eml_directory):
        if filename.endswith(".eml"):
            eml_file = os.path.join(eml_directory, filename)
            date, email_content, subject, reply_count = extract_email_content(eml_file)
            email_data.append((date, filename, email_content, subject, reply_count))
            output_file = os.path.join(html_output_directory, os.path.splitext(filename)[0] + ".html")
            save_email_content(email_content, output_file)
            print(f"Saved content of {filename} to {output_file}")

    # Sort emails by date
    email_data.sort(key=lambda x: parsedate_to_datetime(x[0]))
    save_all_emails_to_one_file(email_data, combined_output_file)
    print(f"Saved combined content to {combined_output_file}")

    # Convert HTML files to other formats
    for filename in os.listdir(html_output_directory):
        if filename.endswith(".html"):
            html_file = os.path.join(html_output_directory, filename)
            base_name = os.path.splitext(filename)[0]
            md_file = os.path.join(markdown_output_directory, base_name + ".md")
            convert_to_markdown(html_file, md_file)
            pdf_file = os.path.join(pdf_output_directory, base_name + ".pdf")
            convert_to_pdf(html_file, pdf_file)
            txt_file = os.path.join(text_output_directory, base_name + ".txt")
            convert_to_text(html_file, txt_file)
            json_file = os.path.join(json_output_directory, base_name + ".json")
            convert_to_json(html_file, json_file)

    # Convert combined HTML file to other formats
    convert_to_markdown(combined_output_file, combined_md_file)
    print(f"Converted combined HTML to Markdown: {combined_md_file}")
    convert_to_pdf(combined_output_file, combined_pdf_file)
    print(f"Converted combined HTML to PDF: {combined_pdf_file}")
    convert_to_text(combined_output_file, combined_txt_file)
    print(f"Converted combined HTML to text: {combined_txt_file}")
    convert_to_json(combined_output_file, combined_json_file)
    print(f"Converted combined HTML to JSON: {combined_json_file}")