-
-
Save xarg/3b33e5c937e663b511a3 to your computer and use it in GitHub Desktop.
Parse email body and get only the text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import email
def _decode_part_to_utf8(part):
    """Return the payload of a single message part as UTF-8 encoded bytes.

    The Content-Transfer-Encoding (base64 / quoted-printable) is undone first.
    If the part declares a charset, the payload is decoded with it (undecodable
    bytes are dropped) and re-encoded as UTF-8. If no charset is declared, or
    the declared charset is unknown to Python, the transfer-decoded bytes are
    returned unchanged because there is nothing reliable to transcode from.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None for multipart containers.
        return b""

    charset = part.get_content_charset()
    if charset is None:
        return payload

    try:
        return payload.decode(charset, "ignore").encode("utf8", "replace")
    except LookupError:
        # Charset label not recognised by Python's codec registry.
        return payload


def get_decoded_email_body(message_body):
    """Decode an email message and return only its text body.

    For multipart messages text/plain is preferred, falling back to text/html
    when no text/plain part exists. Parts of any other content type
    (attachments, images, the multipart containers themselves) are ignored.
    If several parts share a content type, the last one wins.
    For single-part messages the payload is decoded regardless of content type.

    :param message_body: Raw message source, e.g. as fetched with imaplib.
        A text string is parsed with ``email.message_from_string``; on
        Python 3 a ``bytes`` object is parsed with ``email.message_from_bytes``.
    :return: The body as a UTF-8 encoded byte string with surrounding
        whitespace stripped. If the selected part declares no charset, its
        transfer-decoded bytes are returned as-is. Empty if a multipart
        message has no text part.
    """
    # ``bytes`` is ``str`` on Python 2, where message_from_bytes does not
    # exist, so the hasattr check keeps the original Python 2 code path.
    if isinstance(message_body, bytes) and hasattr(email, "message_from_bytes"):
        msg = email.message_from_bytes(message_body)
    else:
        msg = email.message_from_string(message_body)

    if not msg.is_multipart():
        return _decode_part_to_utf8(msg).strip()

    plain = None
    html = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            plain = _decode_part_to_utf8(part)
        elif content_type == "text/html":
            html = _decode_part_to_utf8(part)

    if plain is not None:
        return plain.strip()
    if html is not None:
        return html.strip()
    return b""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment