Created
July 4, 2019 03:21
-
-
Save angeloped/6b9140a80446a980efb0c798aa2d7be2 to your computer and use it in GitHub Desktop.
A simple HTML parser built on the HTMLParser module for Python 2.x.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
try:
    # Python 2: chr() only covers code points 0-255, unichr() covers Unicode.
    _unichr = unichr
except NameError:
    # Python 3 has no unichr(); its chr() already covers all of Unicode.
    _unichr = chr


class MyHTMLParser(HTMLParser):
    """HTML parser that prints every parsing event it is notified about.

    Each ``handle_*`` method is a callback invoked by the ``HTMLParser``
    base class while markup is being fed to it; every callback simply
    prints a label followed by the value it received.
    """

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then one line per ``(name, value)`` attribute."""
        print("Start tag:", tag)
        for attr in attrs:
            print(" attr:", attr)

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("End tag :", tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data :", data)

    def handle_comment(self, data):
        """Print the body of an HTML comment."""
        print("Comment :", data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        ``name`` is the entity name without the surrounding ``&`` and ``;``.
        Names missing from ``name2codepoint`` are printed back as the raw
        ``&name;`` text instead of raising ``KeyError``.
        """
        try:
            c = _unichr(name2codepoint[name])
        except KeyError:
            # Arbitrary markup may contain made-up entities; keep parsing.
            c = "&%s;" % name
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference such as ``&#62;``.

        ``name`` is the reference without ``&#`` and ``;``: decimal digits,
        or ``x``/``X`` followed by hexadecimal digits.
        """
        # The hex marker may be upper- or lower-case (``&#x3E;`` / ``&#X3E;``).
        if name.startswith(('x', 'X')):
            c = _unichr(int(name[1:], 16))
        else:
            c = _unichr(int(name))
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print the contents of a declaration such as a doctype."""
        print("Decl :", data)
# Demo: push a single <img> tag through the parser defined above.
sample_markup = '<img src="python-logo.png" alt="The Python logo">'
parser = MyHTMLParser()
parser.feed(sample_markup)
'''
Command: python MyHTMLParser.py
Output:
('Start tag:', 'img')
('\t attr:', ('src', 'python-logo.png'))
('\t attr:', ('alt', 'The Python logo'))
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment