Created
June 29, 2021 22:28
-
-
Save mariocesar/6f4d376d14de747ff35aaa2d44c22d80 to your computer and use it in GitHub Desktop.
Sanitize HTML with BeautifulSoup by defining a whitelist of allowed tags and, per tag, the attributes it may keep.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup, Comment | |
| from typing import Dict, List, Optional, Tuple | |
# Whitelist shape: tag name -> attribute names that tag may keep.
# A value of None means the tag is allowed but no attribute list is given.
AllowedTagsType = Dict[str, Optional[List[str]]]

# Example whitelist, used by sanitize_html when the caller supplies none.
default_allow_tags = {
    # Plain formatting tags: allow no property.
    "p": None,
    "b": None,
    "i": None,
    # Tags that may carry just the class property.
    "u": ["class"],
    "del": ["class"],
    # Links keep href only; e.g. a class property on <a> is removed.
    "a": ["href"],
    "mark": ["class"],
    "code": ["class"],
}
def sanitize_html(
    value: str,
    allowed_tags: Optional[AllowedTagsType] = None,
    remove_comments: bool = True,
) -> str:
    """Return *value* with every non-whitelisted tag and attribute removed.

    Args:
        value: HTML fragment to sanitize.
        allowed_tags: Mapping of tag name to the attribute names that tag may
            keep. A value of ``None`` (or an empty list) means the tag is
            allowed but all of its attributes are stripped. When the mapping
            itself is ``None`` or empty, ``default_allow_tags`` is used.
        remove_comments: When true (the default), HTML comments are removed
            from the output.

    Returns:
        The sanitized markup, stripped of leading and trailing whitespace.

    Notes:
        * A tag that is not whitelisted is removed together with everything
          nested inside it.
        * Only attribute *names* are checked; values are not inspected, so an
          allowed ``href`` can still hold e.g. a ``javascript:`` URL.
    """
    if not allowed_tags:
        allowed_tags = default_allow_tags

    # We use html.parser as it doesn't add the <html><body> tags as the
    # html5 parser does.
    soup = BeautifulSoup(value, "html.parser")

    if remove_comments:
        # Dropping comments first leaves fewer nodes in the tree below.
        for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
            comment.extract()

    for node in soup.find_all():
        if node.name not in allowed_tags:
            # Detaches the tag and its whole subtree from the document.
            node.extract()
            continue

        # None / empty list both mean "no attribute is allowed on this tag".
        allowed_attributes = allowed_tags[node.name] or ()

        # Iterate over a snapshot of the names because attrs is mutated
        # inside the loop.
        for attr in list(node.attrs):
            if attr.lower() not in allowed_attributes:
                del node.attrs[attr]

    return soup.encode().strip().decode()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment