Last active
November 6, 2020 19:22
-
-
Save farooqkz/4932cc1921bf365daece02e30a8f89ec to your computer and use it in GitHub Desktop.
Clean HTML tag attributes (except IMG's src and A's href) and remove DIVs. Good for converting HTML to Markdown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code by Farooq Karimi Zadeh <[email protected]>
# Under CC0 1.0
import sys
from html import escape
from html.parser import HTMLParser
class MyParser(HTMLParser):
    """Re-emit HTML on STDOUT with attributes stripped and DIVs removed.

    Every start tag is printed without attributes, except that ``img``
    keeps its ``src`` and ``a`` keeps its ``href``.  ``div`` start and end
    tags are dropped entirely (their contents are still emitted).  Each
    tag and each run of text is written with ``print``, i.e. on its own
    line.
    """

    # The only attribute that survives, keyed by tag name.
    _KEPT_ATTRIBUTE = {"img": "src", "a": "href"}

    def handle_starttag(self, tag, attrs):
        """Print the start tag, keeping at most its one allowed attribute.

        Args:
            tag: Tag name as reported by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                for an attribute written without a value.
        """
        if tag == "div":
            return

        kept = self._KEPT_ATTRIBUTE.get(tag)
        if kept is None:
            print(f"<{tag}>")
            return

        # A missing or valueless attribute is emitted as an empty string
        # rather than as the text "None".
        value = dict(attrs).get(kept) or ""
        # The value is re-escaped so quotes or ampersands in a URL cannot
        # break out of the single-quoted attribute.
        print(f"<{tag} {kept}='{escape(value, quote=True)}'>")

    def handle_data(self, data):
        """Print a run of text content unchanged."""
        print(data)

    def handle_endtag(self, tag):
        """Print the end tag unless it closes a ``div``."""
        if tag != "div":
            print(f"</{tag}>")
# Script entry point: filter the HTML document on STDIN to STDOUT.
if __name__ == "__main__":
    parser = MyParser()
    # Read the whole of STDIN; input() would stop at the first newline and
    # raise EOFError on empty input.
    parser.feed(sys.stdin.read())
    # close() makes the parser process any input it is still buffering.
    parser.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Reads from STDIN and writes to STDOUT