Skip to content

Instantly share code, notes, and snippets.

@mariocesar
Created June 29, 2021 22:28
Show Gist options
  • Select an option

  • Save mariocesar/6f4d376d14de747ff35aaa2d44c22d80 to your computer and use it in GitHub Desktop.

Select an option

Save mariocesar/6f4d376d14de747ff35aaa2d44c22d80 to your computer and use it in GitHub Desktop.
Sanitize HTML with BeautifulSoup by defining the allowed tags and attributes.
from bs4 import BeautifulSoup, Comment
from typing import Dict, List, Optional, Tuple
# Shape of an allow-list: tag name -> list of attribute names, or None when
# no attribute list is given for that tag.
AllowedTagsType = Dict[str, Optional[List[str]]]

# Example allow-list, used by sanitize_html when the caller supplies none.
default_allow_tags: AllowedTagsType = {
    # Tags with no attribute list.
    "p": None,
    "b": None,
    "i": None,
    # Tags whose attribute list names only ``class``.
    "u": ["class"],
    "del": ["class"],
    # Links list ``href`` only, so ``class`` is not among the listed attributes.
    "a": ["href"],
    "mark": ["class"],
    "code": ["class"],
}
def sanitize_html(
    value: str,
    allowed_tags: Optional[AllowedTagsType] = None,
    remove_comments: bool = True,
) -> str:
    """Return *value* with tags and attributes outside the allow-list removed.

    Args:
        value: HTML fragment to clean.
        allowed_tags: Maps a tag name to the list of attribute names that tag
            may keep. A value of ``None`` (or an empty list) means the tag is
            allowed but may keep no attributes. When the argument itself is
            ``None``, ``default_allow_tags`` is used; an explicit empty dict
            allows no tags at all.
        remove_comments: When true (the default), HTML comments are dropped.

    Returns:
        The sanitized markup as a string, with leading and trailing ASCII
        whitespace trimmed.

    Note:
        Attribute *values* are not inspected, so an allowed ``href`` can still
        carry e.g. a ``javascript:`` URL. Filter values separately if needed.
    """
    # Only fall back to the defaults when nothing was passed; an empty dict is
    # a legitimate "allow nothing" request and must not be replaced.
    if allowed_tags is None:
        allowed_tags = default_allow_tags

    # html.parser keeps a fragment as-is instead of wrapping it in
    # <html><body> the way the lxml and html5lib tree builders do.
    soup = BeautifulSoup(value, "html.parser")

    if remove_comments:
        for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
            comment.extract()

    # find_all() returns a materialized list, so detaching nodes while looping
    # is safe; descendants of an already-removed tag are simply processed in
    # their detached subtree.
    for node in soup.find_all():
        if node.name not in allowed_tags:
            # Disallowed tags are removed together with everything inside them.
            node.extract()
            continue

        # None / empty list => the tag keeps no attributes at all.
        permitted = allowed_tags[node.name] or ()
        # Iterate over a snapshot of the keys because attrs is mutated below.
        for attr in list(node.attrs):
            if attr.lower() not in permitted:
                del node.attrs[attr]

    # Stripping the encoded bytes trims ASCII whitespace only, so characters
    # such as a non-breaking space at the edges of the output are preserved.
    return soup.encode().strip().decode()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment