Skip to content

Instantly share code, notes, and snippets.

@kevenli
Created December 6, 2018 01:11
Show Gist options
  • Save kevenli/fa827830021d58be9ea8186bc739caa0 to your computer and use it in GitHub Desktop.
Save kevenli/fa827830021d58be9ea8186bc739caa0 to your computer and use it in GitHub Desktop.
Parse POP3 email message
def mail_body_decode(message):
    """Decode an RFC 2047 encoded header value into text fragments.

    The value is split with ``email.header.decode_header``. Fragments that
    are already ``str`` are yielded unchanged; ``bytes`` fragments are
    decoded with ``UnicodeDammit``, trying utf8, then gb2312, then the
    charset declared in the header (if any).

    Args:
        message: Raw header value, or ``None`` when the header is absent.

    Yields:
        str: The decoded fragments in order. Callers join them to obtain
        the complete header text.
    """
    if message is None:
        # A missing header decodes to nothing instead of raising TypeError
        # inside decode_header().
        return
    for binary, charset in email.header.decode_header(message):
        if isinstance(binary, str):
            yield str(binary)
            continue
        candidates = ['utf8', 'gb2312']
        if charset:
            # Only offer the declared charset when the header actually
            # named one.
            candidates.append(charset)
        yield UnicodeDammit(binary, candidates).unicode_markup
def get_message_part(message_part, key):
    """Return the stored value of the first header named ``key``.

    The lookup is case-insensitive and reads the ``_headers`` list of
    ``message_part`` directly, so the value is returned exactly as it is
    stored there.

    Args:
        message_part: Object exposing ``_headers`` as a sequence of
            ``(name, value)`` pairs.
        key: Header name to look up.

    Returns:
        The value of the first matching header, or ``None`` if no header
        has that name.
    """
    wanted = key.lower()  # hoisted: invariant across the scan
    # Stop at the first match instead of materialising every match.
    return next(
        (value for name, value in message_part._headers
         if name.lower() == wanted),
        None,
    )
def parse_attachment(message_part):
    """Build a file-like attachment object from one MIME part.

    A part counts as an attachment when its Content-Disposition header
    starts with ``attachment``. The decoded payload is wrapped in a
    ``BytesIO`` carrying these extra attributes:

    * ``content_type`` - value of ``message_part.get_content_type()``
    * ``size`` - length of the decoded payload in bytes
    * ``name`` - from the ``filename`` / ``name`` parameter, else ``None``
    * ``create_date``, ``mod_date``, ``read_date`` - raw parameter strings,
      else ``None``

    Parameters are read from Content-Disposition first and Content-Type
    second; a later parameter overwrites an earlier one, so a Content-Type
    ``name`` wins over a Content-Disposition ``filename`` when both exist.

    Args:
        message_part: A message part as produced by ``Message.walk()``.

    Returns:
        The populated ``BytesIO``, or ``None`` when the part is not an
        attachment.
    """
    content_disposition = get_message_part(message_part, 'Content-Disposition')
    if not content_disposition:
        return None
    content_disposition = ''.join(mail_body_decode(content_disposition))
    dispositions = content_disposition.strip().split(";")
    # Tolerate whitespace before the first ';' ("attachment ; filename=...").
    if dispositions[0].strip().lower() != "attachment":
        return None

    # Content-Type is optional; treat a missing header as "no parameters"
    # rather than passing None on to the header decoder.
    content_type = get_message_part(message_part, "Content-Type") or ''
    content_type = ''.join(mail_body_decode(content_type))
    content_types = content_type.strip().split(";")

    file_data = message_part.get_payload(decode=True)
    if file_data is None:
        # get_payload(decode=True) returns None for multipart containers
        # (e.g. a forwarded message attached as message/rfc822).
        file_data = b""

    attachment = BytesIO(file_data)
    attachment.content_type = message_part.get_content_type()
    attachment.size = len(file_data)
    attachment.name = None
    attachment.create_date = None
    attachment.mod_date = None
    attachment.read_date = None

    for param in dispositions[1:] + content_types[1:]:
        name, separator, value = param.partition("=")
        if not separator:
            # Empty segment (trailing ';') or a bare token without a value.
            continue
        name = name.lower().strip()
        value = value.strip()
        if name in ('name', 'filename'):
            attachment.name = value.strip('"')
        elif name == "create-date":
            attachment.create_date = value  # TODO: datetime
        elif name == "modification-date":
            attachment.mod_date = value  # TODO: datetime
        elif name == "read-date":
            attachment.read_date = value  # TODO: datetime
    return attachment
def parse_mail(raw_email):
    """Parse a raw email into a ``Mail`` record plus the parsed message.

    The raw input is converted to text with ``UnicodeDammit`` (trying utf8
    then gb2312) and parsed with ``email.message_from_string`` under an
    ``EmailPolicy(utf8=True)`` policy.

    Headers that are absent do not raise: a missing From or Subject becomes
    an empty string, and a missing or unparseable Date, or a missing
    Message-ID (and its hash), becomes ``None``.

    Args:
        raw_email: The raw message as handed to ``UnicodeDammit``.

    Returns:
        tuple: ``(mail, header_message)`` where ``mail`` is a ``Mail`` with
        ``mail_message_id``, ``mail_date``, ``mail_from``, ``mail_to``,
        ``subject`` and ``mail_message_id_hash`` set, and
        ``header_message`` is the parsed message object.
    """
    dammit = UnicodeDammit(raw_email, ["utf8", "gb2312"])
    logger.debug(dammit.tried_encodings)
    raw_email_str = dammit.unicode_markup
    header_message = email.message_from_string(
        raw_email_str, policy=email.policy.EmailPolicy(utf8=True))

    message_id = header_message.get('Message-ID')
    message_to = header_message.get('To')
    # Default to '' so an absent header decodes to an empty string.
    message_from = ''.join(mail_body_decode(header_message.get('From', '')))
    subject = ''.join(mail_body_decode(header_message.get('Subject', '')))
    logger.debug(subject)

    # parsedate() returns None for an empty or unparseable value.
    # NOTE(review): parsedate() drops the UTC offset, so the header's
    # wall-clock time is interpreted as local time - confirm this is
    # intended.
    date_tuple = parsedate(header_message.get('Date', ''))
    message_date = datetime.fromtimestamp(mktime(date_tuple)) if date_tuple else None

    mail = Mail()
    mail.mail_message_id = message_id
    mail.mail_date = message_date
    mail.mail_from = message_from
    mail.mail_to = message_to
    mail.subject = subject
    # md5 serves as a fixed-length digest of the Message-ID here.
    mail.mail_message_id_hash = (
        hashlib.md5(message_id.encode('utf8')).hexdigest()
        if message_id is not None else None
    )
    return mail, header_message
def extract_mail_attachements(message):
    """Yield ``(name, data)`` for every attachment found in ``message``.

    Every part returned by ``message.walk()`` is passed to
    ``parse_attachment``; parts for which it returns ``None`` are skipped.

    Args:
        message: A parsed message object providing ``walk()``.

    Yields:
        tuple: ``(name, data)`` where ``name`` is the attachment's ``name``
        attribute (possibly ``None``) and ``data`` is the decoded payload
        bytes.
    """
    for part in message.walk():
        attachment = parse_attachment(part)
        if attachment is not None:
            # parse_attachment already decoded the payload into the buffer;
            # reuse it instead of decoding the part a second time.
            yield attachment.name, attachment.getvalue()
@kevenli
Copy link
Author

kevenli commented Dec 6, 2018

I tried chardet, but it misdetects mail encoded in "GB2312" as "ISO-8859-1"; UnicodeDammit works well.

@kevenli
Copy link
Author

kevenli commented Dec 6, 2018

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment