Created
April 15, 2025 08:38
-
-
Save yeiichi/26950bf73064384272c0d812a6271194 to your computer and use it in GitHub Desktop.
Clean a BeautifulSoup object by removing ruby-related annotations.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Utilities for stripping ruby annotations from parsed HTML."""
from bs4 import BeautifulSoup
# Tag names that carry ruby annotation content: <rt> holds the annotation
# text and <rp> holds the fallback parentheses. Kept as a list because it is
# passed straight to BeautifulSoup.find_all() as a multi-name filter.
RUBY_ANNOTATION_TAGS = ['rt', 'rp']
def clean_ruby_annotations(soup: BeautifulSoup) -> BeautifulSoup:
    """Clean a BeautifulSoup object by removing ruby-related annotations.

    Removes every ``<rt>`` and ``<rp>`` tag and then unwraps every ``<ruby>``
    tag, so only the base content remains in place of each ruby element.
    Other content is left untouched.

    The document is modified in place; the return value is the same object
    that was passed in, returned for call-chaining convenience.

    Args:
        soup: The BeautifulSoup object representing a parsed HTML document.

    Returns:
        The same BeautifulSoup object, with ruby annotations cleaned.
    """
    remove_ruby_tags(soup)
    unwrap_ruby_tags(soup)
    return soup
def remove_ruby_tags(html_doc: BeautifulSoup) -> None:
    """Remove ``<rt>`` and ``<rp>`` tags from the HTML document.

    Each matching tag is destroyed together with its contents. The document
    is modified in place.

    Args:
        html_doc: The parsed HTML document to modify.
    """
    # find_all() returns the complete result list before the loop starts, so
    # removing tags from the tree while iterating is safe.
    for annotation_tag in html_doc.find_all(RUBY_ANNOTATION_TAGS):
        annotation_tag.decompose()
def unwrap_ruby_tags(html_doc: BeautifulSoup) -> None:
    """Unwrap ``<ruby>`` tags, preserving their inner content.

    Each ``<ruby>`` element is replaced by its own children, so the base
    content stays where the element used to be. The document is modified in
    place.

    Args:
        html_doc: The parsed HTML document to modify.
    """
    for ruby_tag in html_doc.find_all('ruby'):
        ruby_tag.unwrap()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment