-
-
Save benwattsjones/060ad83efd2b3afc8b229d41f9b246c4 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python3 | |
# ~*~ utf-8 ~*~ | |
import mailbox | |
import bs4 | |
def get_html_text(html): | |
try: | |
return bs4.BeautifulSoup(html, 'lxml').body.get_text(' ', strip=True) | |
except AttributeError: # message contents empty | |
return None | |
class GmailMboxMessage(): | |
def __init__(self, email_data): | |
if not isinstance(email_data, mailbox.mboxMessage): | |
raise TypeError('Variable must be type mailbox.mboxMessage') | |
self.email_data = email_data | |
def parse_email(self): | |
email_labels = self.email_data['X-Gmail-Labels'] | |
email_date = self.email_data['Date'] | |
email_from = self.email_data['From'] | |
email_to = self.email_data['To'] | |
email_subject = self.email_data['Subject'] | |
email_text = self.read_email_payload() | |
def read_email_payload(self): | |
email_payload = self.email_data.get_payload() | |
if self.email_data.is_multipart(): | |
email_messages = list(self._get_email_messages(email_payload)) | |
else: | |
email_messages = [email_payload] | |
return [self._read_email_text(msg) for msg in email_messages] | |
def _get_email_messages(self, email_payload): | |
for msg in email_payload: | |
if isinstance(msg, (list,tuple)): | |
for submsg in self._get_email_messages(msg): | |
yield submsg | |
elif msg.is_multipart(): | |
for submsg in self._get_email_messages(msg.get_payload()): | |
yield submsg | |
else: | |
yield msg | |
def _read_email_text(self, msg): | |
content_type = 'NA' if isinstance(msg, str) else msg.get_content_type() | |
encoding = 'NA' if isinstance(msg, str) else msg.get('Content-Transfer-Encoding', 'NA') | |
if 'text/plain' in content_type and 'base64' not in encoding: | |
msg_text = msg.get_payload() | |
elif 'text/html' in content_type and 'base64' not in encoding: | |
msg_text = get_html_text(msg.get_payload()) | |
elif content_type == 'NA': | |
msg_text = get_html_text(msg) | |
else: | |
msg_text = None | |
return (content_type, encoding, msg_text) | |
######################### End of library, example of use below | |
mbox_obj = mailbox.mbox('path/to/your-mbox-file-from-gmail.mbox') | |
num_entries = len(mbox_obj) | |
for idx, email_obj in enumerate(mbox_obj): | |
email_data = GmailMboxMessage(email_obj) | |
email_data.parse_email() | |
print('Parsing email {0} of {1}'.format(idx, num_entries)) |
Thanks @redcay I added this:
self.email_from = self.email_data['From'] email_to = self.email_data['To'] self.email_to = self.email_data['To'] email_subject = self.email_data['Subject'] self.email_subject = self.email_data['Subject'] email_text = self.read_email_payload() self.email_text = self.read_email_payload()
Then I was able to the response I was looking for in
email_data.parse_email() print(email_data.email_from) print(email_data.email_to) print(email_data.email_subject) print(email_data.email_text)
is there a method to get only the text values without other data, just the email body in this method
email_text = self.read_email_payload()
email_text does not contain pure email text; there is noisy data (like css tags). Is there a method to get the pure text?
"Is there a way" yes; but it depends on what exactly you mean. A common solution to extracting just the text from HTML payloads is to run Beautifulsoup on the HTML. If you want to trim off quoted text from earlier messages in a thread, I don't know of any existing libraries for that (but that doesn't mean there aren't any). Similarly, you might want to trim signature blocks (honest-to-RFC signatures start with newline, dash, dash, space, newline; but very few modern signatures adhere to this convention).
See also:
For anyone who wants useful output out of the HTML: you'll want
msg.get_payload(decode=True).decode()
This would have saved me a lot of heartburn.
Oh goody AI generated comments... that say nothing useful