Last active
March 5, 2020 15:55
-
-
Save mikehwang/f0c581b73e1f5e5b8bbecec6217841fc to your computer and use it in GitHub Desktop.
Script to list emails using IMAP for the purpose of trying to **understand** how to parse emails for an ETL project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from imaplib import IMAP4_SSL | |
import email | |
# Usage: python list_emails.py <hostname> <user> <password> | |
# https://stackoverflow.com/questions/2230037/how-to-fetch-an-email-body-using-imaplib-in-python
# That answer was very helpful, so parts of this code are probably adapted from it.
def list_emails(hostname, user, password):
    """Print a summary of every message in the default IMAP mailbox.

    Connects to ``hostname`` on port 993 over SSL, logs in with ``user`` and
    ``password``, selects the default mailbox read-only and walks the search
    results in reverse order (highest message number first).  For each
    message it prints:

    * a ``MESSAGE:`` section with the Subject, From, thread-index,
      thread-topic and message-id headers (``None`` when a header is absent),
    * a ``WALK:`` section with the content type, content-id and
      content-disposition of every MIME part,
    * a ``STRUCTURE:`` section with the MIME tree of the message.

    Nothing is returned.  Any exception raised along the way is caught and
    printed instead of being propagated.
    """
    # Private stdlib helper that prints the MIME structure of a message.
    # Imported once here rather than on every loop iteration.
    from email.iterators import _structure

    try:
        with IMAP4_SSL(hostname, port=993) as im:
            im.login(user, password)
            # default is to select inbox
            im.select(readonly=True)
            (result, indices) = im.search(None, "ALL")
            if result != "OK":
                print("What happened?")
                return

            # The search response is treated as a one-element list holding
            # the space-separated message numbers; reversed() walks them
            # from the last number to the first.
            for i in reversed(indices[0].split()):
                (result, data) = im.fetch(i, "(RFC822)")
                # "(ALL)" didn't return what was expected. See RFC
                # https://tools.ietf.org/html/rfc2060.html#section-6.4.5
                if result != "OK" or not data or not isinstance(data[0], tuple):
                    # Skip this message instead of aborting the whole run.
                    print("Could not fetch message {0}".format(
                        i.decode("ascii", "replace")))
                    continue

                # Parse the raw bytes directly: a message is not guaranteed
                # to be valid UTF-8, so decoding it first can raise.
                msg = email.message_from_bytes(data[0][1])

                # str() keeps the join from failing when a header is
                # missing (None), e.g. thread-index / thread-topic.
                print("\n".join(map(str, [
                    "MESSAGE:",
                    msg["Subject"],
                    msg["From"],
                    msg["thread-index"],
                    msg["thread-topic"],
                    msg["message-id"],
                ])))

                print("WALK:")
                for part in msg.walk():
                    # print(list(part.__dict__.keys())) showed:
                    # ['policy', '_headers', '_unixfrom', '_payload',
                    #  '_charset', 'preamble', 'epilogue', 'defects',
                    #  '_default_type']
                    print("\n".join(map(str, [
                        part.get_content_type(),
                        part.get("content-id"),
                        # Reveals a lot of details about this part like
                        # whether this part is an attachment, filename,
                        # filesize, etc
                        part.get("content-disposition"),
                    ])))
                    print("---")

                print("STRUCTURE:")
                _structure(msg)
                print("")
    except Exception as e:
        # Deliberate best-effort: report the problem and return.
        print(e)
if __name__ == "__main__":
    import sys

    # Exactly three arguments are required; fail with a usage message
    # instead of an unpacking ValueError traceback.
    if len(sys.argv) != 4:
        sys.exit("Usage: python list_emails.py <hostname> <user> <password>")

    hostname, user, password = sys.argv[1:]
    # The password is intentionally left out of the banner.
    print("List emails: {0} {1}".format(hostname, user))
    list_emails(hostname, user, password)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Understanding
multipart/*
is crucial. The MIME Wikipedia page has a subsection that explains, at a conceptual level, the different kinds of multipart messages.