Skip to content

Instantly share code, notes, and snippets.

@vijayanandrp
Last active November 2, 2022 12:56
Show Gist options
  • Save vijayanandrp/6e12bcde753794628a8f2a1c34e2af8c to your computer and use it in GitHub Desktop.
Save vijayanandrp/6e12bcde753794628a8f2a1c34e2af8c to your computer and use it in GitHub Desktop.
Function for reading email files (*.eml only) using Python - https://informationcorners.com/read-send-emails-python/
# -*- coding: utf-8 -*-
import re
import email
import smtplib
import mimetypes
from email.mime.multipart import MIMEMultipart
from email import encoders
from email.mime.audio import MIMEAudio
from email.mime.base import MIMEBase
from email.mime.image import MIMEImage
from email.mime.text import MIMEText
import ntpath
import nltk
import datetime
def read_email(email_file):
fp = open(email_file, encoding='utf8')
try:
email_dump = email.message_from_file(fp)
email_details = {}
fetch_items = ['From', 'To', 'Subject', 'Date']
for key in fetch_items:
if key in ['From']:
from_ = email_dump[key]
email_details['Sender_Email_Address'] = ', '.join(list(set(re.findall(r'<([^<]*)>', from_))))
email_details['Sender_Name'] = ', '.join(re.findall(r'"([^"]*)"', from_))
elif key in ['To']:
to = email_dump[key]
email_details['Received_By_Name'] = re.sub(r'<([^<]*)>', '', to).strip()
email_details['Received_Email_Address'] = ', '.join(list(set(re.findall(r'<([^<]*)>', to))))
elif key in ['Subject']:
decode = email.header.decode_header(email_dump['Subject'])[0]
subject = decode[0]
if type(subject) is bytes:
email_details['Subject'] = subject.decode('unicode_escape')
else:
email_details['Subject'] = subject
elif key in ['Date']:
date_stamp = re.sub(' (\+|\-).*', '', str(email_dump[key]))
email_details['Date'] = date_stamp
email_details['Date_Created'] = datetime.datetime.strptime(date_stamp.strip(), '%a, %d %b %Y %H:%M:%S')
message = ''
if email_dump.is_multipart():
for part in email_dump.walk():
ctype = part.get_content_type()
cdispo = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments
if ctype == 'text/plain' and 'attachment' not in cdispo:
message = part.get_payload(decode=True) # decode
break
# not multipart - i.e. plain text, no attachments, keeping fingers crossed
else:
message = email_dump.get_payload(decode=True)
if type(message) is bytes:
message = message.decode('unicode_escape')
if '<html' in message:
message = re.sub('<[^<]+?>', '', message)
message = re.sub(' +', ' ', message)
email_details['Message'] = message
else:
if '<html' in message:
message = nltk.clean_html(message)
message = re.sub('<[^<]+?>', '', message)
message = re.sub(r'[^\x00-\x7F]+', ' ', message)
message = re.sub(' +', ' ', message)
email_details['Message'] = message
# pprint.pprint(email_details)
return email_details
except Exception as error:
print('Unable to read Email - ' + str(error))
return {}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment