Last active
November 2, 2022 12:56
-
-
Save vijayanandrp/6e12bcde753794628a8f2a1c34e2af8c to your computer and use it in GitHub Desktop.
Function for reading email files (*.eml only) using Python - https://informationcorners.com/read-send-emails-python/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import email | |
import smtplib | |
import mimetypes | |
from email.mime.multipart import MIMEMultipart | |
from email import encoders | |
from email.mime.audio import MIMEAudio | |
from email.mime.base import MIMEBase | |
from email.mime.image import MIMEImage | |
from email.mime.text import MIMEText | |
import ntpath | |
import nltk | |
import datetime | |
def read_email(email_file):
    """Parse an ``.eml`` file into a flat dictionary of headers and body text.

    Args:
        email_file: Path of the ``.eml`` file to read.

    Returns:
        A dict with the keys ``Sender_Email_Address``, ``Sender_Name``,
        ``Received_By_Name``, ``Received_Email_Address``, ``Subject``,
        ``Date``, ``Date_Created`` and ``Message``.  A missing header yields
        an empty string; ``Date_Created`` is a naive ``datetime`` holding the
        sender's wall-clock time, or ``None`` when the date cannot be parsed.
        An empty dict is returned (after printing the error) when the
        message cannot be processed at all.

    Raises:
        OSError: If *email_file* cannot be opened.
    """
    address_pattern = r'<([^<]*)>'

    # Binary mode keeps the body bytes intact so that each part can be
    # decoded with its own declared charset; ``with`` closes the handle.
    with open(email_file, 'rb') as fp:
        try:
            # Same CRLF -> LF translation a text-mode read performs, so
            # CRLF-terminated files keep producing "\n" line endings.
            raw_bytes = fp.read().replace(b'\r\n', b'\n')
            email_dump = email.message_from_bytes(raw_bytes)
            email_details = {}

            from_ = _header_text(email_dump, 'From')
            # dict.fromkeys de-duplicates while keeping first-seen order.
            email_details['Sender_Email_Address'] = ', '.join(
                dict.fromkeys(re.findall(address_pattern, from_)))
            email_details['Sender_Name'] = ', '.join(
                re.findall(r'"([^"]*)"', from_))

            to = _header_text(email_dump, 'To')
            email_details['Received_By_Name'] = re.sub(
                address_pattern, '', to).strip()
            email_details['Received_Email_Address'] = ', '.join(
                dict.fromkeys(re.findall(address_pattern, to)))

            email_details['Subject'] = _decode_subject(
                _header_text(email_dump, 'Subject'))

            date_stamp, date_created = _parse_date(
                _header_text(email_dump, 'Date'))
            email_details['Date'] = date_stamp
            email_details['Date_Created'] = date_created

            message = _plain_text_body(email_dump)
            if '<html' in message:
                # Crude tag removal for HTML delivered as the only body.
                message = re.sub('<[^<]+?>', '', message)
            message = re.sub(' +', ' ', message)
            email_details['Message'] = message
            return email_details
        except Exception as error:
            # Best-effort reader: report the problem and return no details.
            print('Unable to read Email - ' + str(error))
            return {}


def _header_text(message, name):
    """Return header *name* of *message* as ``str`` ('' when absent)."""
    from email.header import decode_header

    value = message[name]
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    # Headers holding raw 8-bit bytes come back as a Header object;
    # decode_header() exposes the original bytes, read here as UTF-8.
    return ''.join(
        chunk.decode('utf-8', 'replace') if isinstance(chunk, bytes) else chunk
        for chunk, _ in decode_header(value)
    )


def _decode_subject(raw_subject):
    """Decode every RFC 2047 encoded word of *raw_subject* to text."""
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    try:
        return str(make_header(decode_header(raw_subject)))
    except (HeaderParseError, LookupError, UnicodeError):
        # Malformed encoded word or unknown charset: keep the raw text.
        return raw_subject


def _parse_date(raw_date):
    """Return ``(text without UTC offset, naive datetime or None)``."""
    from email.utils import parsedate_to_datetime

    # Drop everything from the " +hhmm" / " -hhmm" offset onwards.
    date_stamp = re.sub(r' (\+|\-).*', '', raw_date)
    try:
        created = datetime.datetime.strptime(
            date_stamp.strip(), '%a, %d %b %Y %H:%M:%S')
    except ValueError:
        # Other RFC 5322 spellings (no weekday, "GMT", ...); the offset is
        # discarded so the result stays naive like the strptime() result.
        try:
            created = parsedate_to_datetime(raw_date).replace(tzinfo=None)
        except (TypeError, ValueError):
            created = None
    return date_stamp, created


def _plain_text_body(message):
    """Return the decoded body text of *message* ('' when there is none).

    For a multipart message this is the first ``text/plain`` part that is
    not an attachment; otherwise it is the message's own payload.
    """
    part = message
    if message.is_multipart():
        for candidate in message.walk():
            disposition = str(candidate.get('Content-Disposition'))
            if (candidate.get_content_type() == 'text/plain'
                    and 'attachment' not in disposition):
                part = candidate
                break
        else:
            return ''

    payload = part.get_payload(decode=True)
    if payload is None:
        return ''
    # Honour the declared charset; UTF-8 (an ASCII superset) is the fallback.
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, 'replace')
    except LookupError:
        return payload.decode('utf-8', 'replace')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment