Skip to content

Instantly share code, notes, and snippets.

@polymorphm
Created November 6, 2012 22:09
Show Gist options
  • Save polymorphm/4027940 to your computer and use it in GitHub Desktop.
Save polymorphm/4027940 to your computer and use it in GitHub Desktop.
HTML Parser with tag-search
# -*- mode: python; coding: utf-8 -*-
#
# Copyright 2012 Andrej A Antonov <[email protected]>.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
assert str is not bytes
import weakref
from html import parser
from html import entities
class HtmlNode:
def __init__(self, parent=None):
self.set_parent(parent)
def get_parent(self):
return self._parent_ref() \
if self._parent_ref is not None else None
def set_parent(self, parent):
self._parent_ref = weakref.ref(parent) \
if parent is not None else None
class DocHtmlNode(HtmlNode):
def __init__(self):
super().__init__(parent=None)
self.decl = []
self.childs = []
class TagHtmlNode(HtmlNode):
def __init__(self, name, parent=None):
super().__init__(parent=parent)
self.name = name
self.attrs = {}
self.childs = []
class DataHtmlNode(HtmlNode):
def __init__(self, data, parent=None):
super().__init__(parent=parent)
self.data = data
class HtmlParser:
def __init__(self):
self._doc_node = DocHtmlNode()
self._curr_node = self._doc_node
class ParserHandler(parser.HTMLParser):
def handle_starttag(inner_self, tag, attrs):
self._starttag_handle(tag, attrs)
def handle_endtag(inner_self, tag):
self._endtag_handle(tag)
def handle_data(inner_self, data):
self._data_handle(data)
def handle_entityref(inner_self, name):
self._entityref_handle(name)
def handle_charref(inner_self, name):
self._charref_handle(name)
def handle_decl(inner_self, decl):
self._decl_handle(decl)
self._parser_handler = ParserHandler()
def get_node(self):
return self._doc_node
def _starttag_handle(self, tag, attrs):
parent_node = self._curr_node
while not isinstance(parent_node, DocHtmlNode) and \
not isinstance(parent_node, TagHtmlNode):
parent_node = parent_node.get_parent()
new_node = TagHtmlNode(tag, parent=parent_node)
new_node.attrs.update(attrs)
parent_node.childs.append(new_node)
self._curr_node = new_node
def _endtag_handle(self, tag):
closing_node = self._curr_node
while not isinstance(closing_node, DocHtmlNode):
if isinstance(closing_node, TagHtmlNode) and \
closing_node.name == tag:
self._curr_node = closing_node.get_parent()
return
closing_node = closing_node.get_parent()
def _data_handle(self, data):
if isinstance(self._curr_node, DataHtmlNode):
self._curr_node.data += data
return
parent_node = self._curr_node
while not isinstance(parent_node, DocHtmlNode) and \
not isinstance(parent_node, TagHtmlNode):
parent_node = parent_node.get_parent()
new_node = DataHtmlNode(data, parent=parent_node)
parent_node.childs.append(new_node)
self._curr_node = new_node
def _entityref_handle(self, name):
try:
code = entities.name2codepoint[name]
data = chr(code)
except (KeyError, ValueError, ArithmeticError):
data = '&{};'.format(name)
self._data_handle(data)
def _charref_handle(self, name):
try:
code = int(name[1:], 16) if name.startswith('x') else int(name)
data = chr(code)
except (ValueError, ArithmeticError):
data = '&#{};'.format(name)
self._data_handle(data)
def _decl_handle(self, decl):
self._doc_node.decl.append(decl)
def feed(self, data):
self._parser_handler.feed(data)
def html_parse(data):
parser = HtmlParser()
parser.feed(data)
return parser.get_node()
def print_node(node, level=None, print_func=None):
if level is None:
level = 0
if print_func is None:
print_func = print
next_level = level + 1
self_indent = level * 2
attr_indent = self_indent + 4
next_indent = self_indent + 1
if level >= 100:
print_func('{}Error: level too big'.format(' ' * self_indent))
return
if isinstance(node, DataHtmlNode):
print_func('{}DataHtmlNode: {!r}'.format(' ' * self_indent, node.data))
return
if isinstance(node, DocHtmlNode):
print_func('{}DocHtmlNode:'.format(' ' * self_indent))
print_func('{}decl: {!r}'.format(' ' * attr_indent, node.decl))
for child in node.childs:
print_node(child, level=next_level, print_func=print_func)
return
if isinstance(node, TagHtmlNode):
print_func('{}TagHtmlNode({!r}):'.format(' ' * self_indent, node.name))
print_func('{}attrs: {!r}'.format(' ' * attr_indent, node.attrs))
for child in node.childs:
print_node(child, level=next_level, print_func=print_func)
return
print_func('{}Error: unknown type'.format(' ' * self_indent))
def get_all_nodes(node_list, direct_only=None):
if direct_only is None:
direct_only = False
node_iter = iter(node_list)
next_node_list = []
while True:
for node in node_iter:
if not direct_only:
yield node
if isinstance(node, DocHtmlNode) or isinstance(node, TagHtmlNode):
if not direct_only:
next_node_list += node.childs
else:
for child_node in node.childs:
# TODO: in python 3.3 need replace this to
# ``yield from node.childs``
yield child_node
if not next_node_list:
return
node_iter = iter(next_node_list)
next_node_list = []
def find_tag(node_list,
name=None, attrs=None, in_attrs=None,
direct_only=None):
if attrs is None:
attrs = {}
if in_attrs is None:
in_attrs = {}
def check_filter(node):
if not isinstance(node, TagHtmlNode):
return False
if name is not None and name != node.name:
return False
for attr_name in attrs:
if attr_name not in node.attrs:
return False
attr_value = attrs[attr_name]
if node.attrs[attr_name] != attr_value:
return False
for attr_name in in_attrs:
if attr_name not in node.attrs:
return False
attr_value = in_attrs[attr_name]
node_attr_values = node.attrs[attr_name].split(' ')
if attr_value not in node_attr_values:
return False
return True
return filter(check_filter, get_all_nodes(node_list, direct_only=direct_only))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment