Skip to content

Instantly share code, notes, and snippets.

@angeloped
Created July 4, 2019 03:21
Show Gist options
  • Save angeloped/6b9140a80446a980efb0c798aa2d7be2 to your computer and use it in GitHub Desktop.
Save angeloped/6b9140a80446a980efb0c798aa2d7be2 to your computer and use it in GitHub Desktop.
A simple HTML parser built on the HTMLParser module for Python 2.x.
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that prints every parse event it receives.

    Each ``handle_*`` hook prints a short label followed by the item the
    base parser handed over; nothing is stored or returned.  Under
    Python 2 the parenthesised ``print`` arguments are emitted as a tuple,
    e.g. ``('Start tag:', 'img')``.
    """

    @staticmethod
    def _to_char(codepoint):
        """Return the character for integer *codepoint* on Python 2 or 3."""
        try:
            return chr(codepoint)
        except ValueError:
            # Python 2's chr() stops at 255, so wider code points need
            # unichr().  On Python 3 chr() already spans all of Unicode,
            # so a failure there is a genuinely invalid code point.
            if str is not bytes:
                raise
            return unichr(codepoint)  # noqa: F821 -- Python 2 builtin

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then each item of *attrs* on its own line."""
        print("Start tag:", tag)
        for attr in attrs:
            print(" attr:", attr)

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("End tag :", tag)

    def handle_data(self, data):
        """Print a run of text found between tags."""
        print("Data :", data)

    def handle_comment(self, data):
        """Print the body of an HTML comment."""
        print("Comment :", data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        *name* is the entity name without the surrounding ``&`` and ``;``.
        Names missing from the standard entity table are printed back
        unresolved as ``&name;`` instead of raising ``KeyError``.
        """
        # Imported here because the file's top-level imports do not provide
        # the entity table; the module was renamed between Python 2 and 3.
        try:
            from htmlentitydefs import name2codepoint  # Python 2
        except ImportError:
            from html.entities import name2codepoint  # Python 3

        if name in name2codepoint:
            c = self._to_char(name2codepoint[name])
        else:
            c = "&%s;" % name
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference (``&#62;``, ``&#x3E;``).

        *name* is the text after ``&#``: decimal digits, or hexadecimal
        digits prefixed with ``x`` / ``X``.
        """
        # HTML allows either case for the hexadecimal marker.
        if name.startswith(('x', 'X')):
            c = self._to_char(int(name[1:], 16))
        else:
            c = self._to_char(int(name))
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print the contents of a declaration such as a doctype."""
        print("Decl :", data)
# Demo run: push one <img> tag through the parser so its start-tag and
# attribute events are printed.
parser = MyHTMLParser()
parser.feed(
    '<img src="python-logo.png" alt="The Python logo">'
)
'''
Command: python MyHTMLParser.py
Output (the Python 2 print statement shows the parenthesised arguments as a tuple):
('Start tag:', 'img')
('\t attr:', ('src', 'python-logo.png'))
('\t attr:', ('alt', 'The Python logo'))
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment