Created
February 9, 2021 11:51
-
-
Save tripleee/862acc9f239c1bd6f610d22ef341d7d0 to your computer and use it in GitHub Desktop.
Selectively remove text/html from multipart/alternative structures in email
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from email import message_from_bytes | |
| from email.iterators import typed_subpart_iterator, body_line_iterator | |
| from email.policy import default | |
def nontrivial(textpart):
    """
    Tell whether textpart's body has three or more lines.

    Lines are taken from body_line_iterator(), and at most three of
    them are consumed before an answer is given.
    """
    body_lines = body_line_iterator(textpart)
    exhausted = object()
    # all() stops at the first missing line, so a short body is
    # rejected as soon as the iterator runs dry.
    return all(
        next(body_lines, exhausted) is not exhausted
        for _ in range(3)
    )
def scrubbed(message):
    """
    Drop a redundant text/html alternative from message.

    Walk message looking for multipart/alternative containers.  In the
    first container whose direct children include both a text/plain
    part for which nontrivial() is true and a text/html part, remove
    the first text/html child and return True.  The message is
    modified in place.

    Return False, leaving the message untouched, when no container
    qualifies.  True is returned only when a part was actually removed.
    """
    for container in typed_subpart_iterator(
            message, maintype='multipart', subtype='alternative'):
        if not container.is_multipart():
            # Declared as multipart but the payload is not a list of
            # parts, so there are no children to examine.
            continue
        children = container.get_payload()
        has_text = False
        html_index = None
        # Only direct children are alternatives of each other; parts
        # nested deeper belong to some other structure and could not
        # be removed from this container anyway.
        for index, part in enumerate(children):
            ctype = part.get_content_type()
            if ctype == 'text/plain' and nontrivial(part):
                has_text = True
            elif ctype == 'text/html' and html_index is None:
                html_index = index
        # Explicit flags rather than truth-testing the parts: a message
        # object's truth value is its header count, so a part without
        # any headers would test as false.
        if has_text and html_index is not None:
            container.set_payload(
                [part for index, part in enumerate(children)
                 if index != html_index])
            return True
    return False
def killhtml_maybe(filename):
    """
    Read an email message from filename and, if scrubbed() reports
    that it changed the message, overwrite the file with the result.

    The file is left untouched when scrubbed() returns False.
    """
    with open(filename, 'rb') as handle:
        message = message_from_bytes(handle.read(), policy=default)
    if not scrubbed(message):
        return
    # Serialize before reopening the file: opening with 'wb' truncates
    # it immediately, so a failure in as_bytes() after that point would
    # leave the original message destroyed.
    rendered = message.as_bytes()
    with open(filename, 'wb') as handle:
        handle.write(rendered)
def main():
    """
    Command-line entry point: apply killhtml_maybe() to every
    argument after the program name, in order.
    """
    from sys import argv
    paths = argv[1:]
    for path in paths:
        killhtml_maybe(path)
# Run main() only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Author
Author
Unfortunately, when we rewrite a message, the new contents are rendered from Python's internal representation. This ends up rewriting many of the message's structures, so you can't easily diff the result against the input message to see what changed.
Author
https://stackoverflow.com/a/20250820/874188 has a healthier approach to removing attachments but I could not quickly get it to work like I wanted. I suspect I didn't try hard enough.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Inspired by https://superuser.com/questions/1624144/bulk-delete-text-html-parts-of-email-messages