Skip to content

Instantly share code, notes, and snippets.

@agosto-calvinbehling
Last active March 8, 2017 23:07
Show Gist options
  • Save agosto-calvinbehling/bf2330c7c9088d9f7ba68fc5814caecc to your computer and use it in GitHub Desktop.
Save agosto-calvinbehling/bf2330c7c9088d9f7ba68fc5814caecc to your computer and use it in GitHub Desktop.
Random bits of an HTML parser to help with debugging
from HTMLParser import HTMLParser
import json
class HtmlToJson(HTMLParser):
    """
    Transforms HTML to a simplified JSON format.

    Feed markup with ``feed()`` (and ``close()`` to flush buffered text),
    then call ``json()`` to get the serialized result.

    Every start tag becomes a record of the form::

        {'tag': ..., 'attrs': [...], 'children': [...], 'text': [...]}

    ``json()`` then drops every element whose tag is listed in
    ``tag_filters`` (together with its subtree) and simplifies the rest:

    * an element with no non-blank text is replaced by the (nested) list
      of its simplified children, or dropped if it has none;
    * an element with text is kept, without its ``attrs`` / ``children``
      keys when those are empty.
    """

    # Elements that never have content or an end tag (HTML "void
    # elements"). They are recorded but never opened, otherwise the
    # elements following them would be nested inside them.
    _VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self, tag_filters=()):
        """
        :param tag_filters: container of tag names to exclude from the
            output of ``json()``; ``None`` is treated as "no filters".
        """
        HTMLParser.__init__(self)
        # Top-level element records, in document order.
        self._records = []
        # Stack of the records of the elements that are currently open;
        # the last entry is the innermost one.
        self._open = []
        self._filters = tag_filters if tag_filters is not None else ()

    def handle_starttag(self, tag, attrs):
        """Record a new element under the innermost open element."""
        node = {
            'tag': tag,
            'attrs': attrs,
            'children': [],
            'text': [],
        }
        if self._open:
            self._open[-1]['children'].append(node)
        else:
            # No open element: this is a (possibly additional) root.
            self._records.append(node)
        if tag not in self._VOID_TAGS:
            self._open.append(node)

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Elements opened after it that were left unclosed are closed too.
        An end tag that matches no open element (a stray tag, or the end
        half of a void element such as ``<br/>``) is ignored.
        """
        for depth in range(len(self._open) - 1, -1, -1):
            if self._open[depth]['tag'] == tag:
                del self._open[depth:]
                return

    def handle_data(self, data):
        """Attach a text chunk to the innermost open element.

        Text outside of any element has no record to attach to and is
        dropped.
        """
        if self._open:
            self._open[-1]['text'].append(data)

    def _do_filter(self, items):
        """Return copies of ``items`` without the filtered tags.

        Works recursively on ``children``; the input records are left
        untouched so that ``json()`` can be called repeatedly.
        """
        result = []
        for item in items:
            if item['tag'] in self._filters:
                continue
            kept = dict(item)
            kept['children'] = self._do_filter(item['children'])
            result.append(kept)
        return result

    def _do_flatten(self, items):
        """Return the simplified form of ``items`` (see class docstring).

        New dicts are built instead of modifying ``items`` in place.
        """
        result = []
        for item in items:
            children = self._do_flatten(item['children'])
            # any() instead of filter(): on Python 3 a filter object is
            # always truthy, which made every element look like it had
            # text.
            if not any(chunk.strip() for chunk in item['text']):
                # Text-less element: only its children (as one nested
                # list) survive.
                if children:
                    result.append(children)
                continue
            node = {'tag': item['tag']}
            if item['attrs']:
                node['attrs'] = item['attrs']
            if children:
                node['children'] = children
            node['text'] = item['text']
            result.append(node)
        return result

    def json(self):
        """Return the filtered, simplified document as a JSON string."""
        filtered = self._do_filter(self._records)
        flattened = self._do_flatten(filtered)
        return json.dumps(flattened)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment