Last active
March 8, 2017 23:07
-
-
Save agosto-calvinbehling/bf2330c7c9088d9f7ba68fc5814caecc to your computer and use it in GitHub Desktop.
Random bits of an HTML parser to help with debugging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from HTMLParser import HTMLParser | |
import json | |
class HtmlToJson(HTMLParser):
    """Transforms HTML to a simplified JSON format.

    Every start tag becomes a record of the form
    ``{'tag': ..., 'attrs': [...], 'children': [...], 'text': [...]}``.
    Records for top-level elements are collected in ``self._records``;
    nested elements are stored in their parent's ``'children'`` list.

    Usage: ``feed()`` the markup, call ``close()`` so the base parser
    flushes any text it is still buffering, then call ``json()``.

    Args:
        tag_filters: Container of tag names. Elements with one of these
            tags (and everything below them) are left out of ``json()``.
    """

    # Elements that never have an end tag. They are recorded but never
    # opened, otherwise all following content would be nested inside them.
    _VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self, tag_filters):
        HTMLParser.__init__(self)
        self._records = []
        # Stack of the records of the currently open elements, outermost
        # first. Empty while the parser is outside every element.
        self._path = []
        self._filters = tag_filters

    def handle_starttag(self, tag, attrs):
        """Record a new element under the innermost open element.

        Args:
            tag: Tag name as reported by the base parser.
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        node = {
            'tag': tag,
            'attrs': attrs,
            'children': [],
            'text': [],
        }
        # With no open element the record is a new top-level entry; this
        # also covers documents with several sibling root elements.
        if self._path:
            siblings = self._path[-1]['children']
        else:
            siblings = self._records
        siblings.append(node)
        if tag not in self._VOID_TAGS:
            self._path.append(node)

    def handle_endtag(self, tag):
        """Close the innermost open element named ``tag``.

        Elements opened after it that were never closed explicitly
        (e.g. ``<li>`` without ``</li>``) are closed with it. An end tag
        that matches no open element is ignored, which also covers the
        end event the base parser emits for ``<br/>``-style void tags.
        """
        for depth in range(len(self._path) - 1, -1, -1):
            if self._path[depth]['tag'] == tag:
                del self._path[depth:]
                return

    def handle_data(self, data):
        """Attach ``data`` to the innermost open element.

        Text outside every element (typically whitespace between or
        after top-level tags) has no record to belong to and is dropped.
        """
        if self._path:
            self._path[-1]['text'].append(data)

    def _do_filter(self, items):
        """Return copies of ``items`` without the filtered tags.

        Filtering is recursive. The input records are not modified, so
        the parsed tree can be rendered more than once.
        """
        result = []
        for item in items:
            if item['tag'] in self._filters:
                continue
            kept = dict(item)
            kept['children'] = self._do_filter(item['children'])
            result.append(kept)
        return result

    def _do_flatten(self, items):
        """Return a compacted copy of ``items``.

        * An element with only blank (or no) text is replaced by the
          list of its flattened children, or dropped when that list is
          empty. The list is appended as a single nested entry, so the
          result may contain lists as well as dicts.
        * An element with non-blank text is kept; its ``'attrs'`` and
          ``'children'`` keys are omitted when empty.

        The input records are not modified.
        """
        result = []
        for item in items:
            children = self._do_flatten(item['children'])
            # any() behaves the same whether the interpreter's filter()
            # would return a list or a lazy iterator.
            if not any(chunk.strip() for chunk in item['text']):
                if children:
                    result.append(children)
                continue
            node = {'tag': item['tag']}
            if item['attrs']:
                node['attrs'] = item['attrs']
            if children:
                node['children'] = children
            node['text'] = list(item['text'])
            result.append(node)
        return result

    def json(self):
        """Return the filtered, flattened records as a JSON string.

        Safe to call repeatedly: the parsed records are left untouched.
        """
        filtered = self._do_filter(self._records)
        flattened = self._do_flatten(filtered)
        return json.dumps(flattened)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment