Created
June 20, 2017 14:00
-
-
Save PM2Ring/949539473ae56eb7bfb7d709512ea896 to your computer and use it in GitHub Desktop.
Show the structure of an HTML file (or part thereof) in the shell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
''' Show HTML file structure
    Written by PM 2Ring 2017.06.19
'''
import sys | |
from html.parser import HTMLParser | |
# HTML tags that are treated as having no end tag, so they never change the
# nesting depth. This covers the HTML void elements (which cannot have an
# end tag at all), plus 'p', whose end tag is optional and is frequently
# omitted in real-world markup.
unpaired = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'p', 'param', 'source', 'track', 'wbr',
}
def bold(s):
    ''' Return `s` placed between the ANSI "bold on" and "reset" codes '''
    template = '\x1b[1m%s\x1b[0m'
    return template % s
# Bold column labels, one for each kind of parser event that gets printed.
labels = ('START', 'END', 'DATA', 'COMMENT')
startlbl, endlbl, datalbl, commentlbl = [bold(text) for text in labels]
class DumpHTML(HTMLParser):
    ''' HTML parser that prints every tag, text run and comment it meets.

        Each output line starts with the source line number, followed by
        one space of indentation per level of tag nesting.
    '''

    def __init__(self):
        super().__init__()
        # Current nesting level; adjusted by the start/end tag handlers.
        self.depth = 0

    @property
    def line_head(self):
        ''' Output line prefix: zero-padded line number, then the indent '''
        lineno, _offset = self.getpos()
        number = str(lineno).zfill(4)
        indent = ' ' * self.depth
        return number + indent

    def handle_starttag(self, tag, attrs):
        ''' Report a start tag with its attributes; nest deeper if paired '''
        attr_map = dict(attrs)
        print(self.line_head, startlbl, tag, attr_map)
        if tag not in unpaired:
            self.depth += 1

    def handle_endtag(self, tag):
        ''' Report an end tag and step back out one level.

            End tags of unpaired tags are ignored. StopIteration is raised
            once the depth drops below the level parsing started at.
        '''
        if tag in unpaired:
            return
        # Step out first so the end tag lines up with its start tag.
        self.depth -= 1
        print(self.line_head, endlbl, tag)
        if self.depth < 0:
            raise StopIteration

    def handle_data(self, data):
        ''' Report a text run, skipping runs that are only whitespace '''
        if not data.strip():
            return
        print(self.line_head, datalbl, repr(data))

    def handle_comment(self, comment):
        ''' Report a comment '''
        print(self.line_head, commentlbl, repr(comment))
def main():
    ''' Command-line entry point: dump the structure of one HTML file.

        Expects exactly one command-line argument, the name of the file to
        dump. With any other argument count the usage text is written to
        stderr and the process exits with status 2.
    '''
    if len(sys.argv) != 2:
        print('HTML file dumper\nUsage:\n%s filename' % sys.argv[0],
              file=sys.stderr)
        # A usage error is a failure, so report it with a non-zero status.
        sys.exit(2)

    # Decode explicitly rather than relying on the platform's default
    # encoding; undecodable bytes are replaced so that a stray byte cannot
    # abort the dump.
    with open(sys.argv[1], encoding='utf-8', errors='replace') as f:
        data = f.read()

    parser = DumpHTML()
    try:
        parser.feed(data)
        parser.close()
    except StopIteration:
        # The parser raises StopIteration when an end tag takes the depth
        # below zero; the rest of the input is then skipped.
        msg = 'Data found at a higher nesting level than the starting data; skipping.'
        print(bold('Warning:'), msg)
# Run the dumper only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment