Created
June 20, 2017 14:13
-
-
Save PM2Ring/37ba8f87a0e0b474dab4a5ccbf3b3772 to your computer and use it in GitHub Desktop.
Display a Stack Overflow chat room transcript in the shell, with ANSI color codes. Work in progress.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
''' "Simple" parser to extract & print the messages from
    an SO chat room transcript file or URL

    Work in progress, but usable. I think. :)

    Written by PM 2Ring 2017.06.20
'''
import sys | |
from html.parser import HTMLParser | |
import urllib.request | |
from urllib.error import URLError | |
# ANSI style & color numbers; each is a valid `mode` argument for `style`
BOLD = 1
ULINE = 4
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35
CYAN = 36
GREY = 37
def style(s, mode):
    ''' Wrap `s` in ANSI escape codes

        `mode` is the number placed in the opening `ESC[<mode>m` sequence;
        the returned string ends with `ESC[0m`, which resets the terminal
        style. `s` is formatted with an f-string, so it need not be a str.
    '''
    return f'\x1b[{mode}m{s}\x1b[0m'
def bold(s):
    ''' Return `s` wrapped in the ANSI bold style '''
    return style(s, BOLD)
# A red dashed rule, used as a separator line in the output
REDLINE = style('- ' * 32, RED)

# HTML tags that do not have end tags (the HTML "void elements").
# The parser must not descend into these, or the matching of later
# end tags against the current node goes wrong.
unpaired = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
}

# Bold labels for the parser's verbose output
startlbl, endlbl, datalbl = map(bold, ('START', 'END', 'DATA'))
class Node:
    ''' A directed tree node for HTML tags

        The root of the tree is a fake tag named "root". A data element is
        also stored as a node, with the tag "data" and its text (a plain
        string) as its only child.

        Attributes:
        parent   -- the enclosing Node, or None for the root
        tag      -- the tag name
        attrs    -- dict of the tag's attributes
        children -- list of child Nodes (or one string, for "data" nodes)
        nid      -- the `id` attribute, or '' if absent
        nclass   -- the `class` attribute, or '' if absent
    '''
    def __init__(self, parent, tag, attrs):
        self.parent = parent
        self.tag = tag
        self.attrs = attrs
        self.children = []
        # An attribute written without a value (e.g. `<div class>`) has the
        # value None; normalise to '' so that string methods such as
        # `.startswith` can always be used on `nid` and `nclass`.
        self.nid = attrs.get('id') or ''
        self.nclass = attrs.get('class') or ''

    def __repr__(self):
        return f'{self.tag}{self.attrs} {len(self.children)}'

    def __getitem__(self, key):
        ''' Index (or slice) the list of children '''
        return self.children[key]

    def __iter__(self):
        ''' Iterate over the direct children '''
        return iter(self.children)

    # NOTE: there is deliberately no `__len__`: defining it would make a
    # childless Node falsy, and the search functions return Nodes that
    # callers may test for truth.

    def iter_tag(self, tag):
        ''' An iterator over all child nodes that match `tag` '''
        for n in self:
            if n.tag == tag:
                yield n

    def append(self, node):
        ''' Add `node` (or, for a "data" node, its text) as the last child '''
        self.children.append(node)

    def show(self, depth=0):
        ''' Depth-first traversal to print a node & its children '''
        print(depth, ' '*depth, self)
        # The child of a data node is a string, not a Node, so stop here
        if self.tag == 'data':
            return
        depth += 1
        for n in self:
            n.show(depth)
class ParseToTree(HTMLParser):
    ''' Parse some HTML into a tree of Nodes.

        The tree hangs off `self.rootnode`, a fake node with the tag "root".

        You can feed it a partial document and it will
        raise StopIteration when the top node is closed,
        ignoring any subsequent data.

        If `verbose` is true, each start tag, end tag and data string
        is printed as it is encountered.
    '''
    def __init__(self, verbose=False):
        super().__init__()
        self.verbose = verbose
        # Current nesting level; only used to indent the verbose output
        self.depth = 0
        self.rootnode = Node(None, 'root', {})
        # The innermost node that is still open
        self.current = self.rootnode

    @property
    def line_head(self):
        ''' Line number and indentation '''
        return str(self.getpos()[0]).zfill(4) + ' ' * self.depth

    def handle_starttag(self, tag, attrs):
        ''' Add a node for `tag` to the current node and, unless the
            tag is unpaired, make the new node the current node
        '''
        attrs = dict(attrs)
        if self.verbose:
            print(self.line_head, startlbl, tag, attrs)
        paired = tag not in unpaired
        if paired:
            self.depth += 1
        parent = self.current
        node = Node(parent, tag, attrs)
        parent.append(node)
        if paired:
            self.current = node

    def handle_endtag(self, tag):
        ''' Close the current node, which must match `tag`

            Raises SystemExit (with exit status 1) on a tag mismatch, and
            StopIteration when the top-level node has been closed.
        '''
        # Unpaired tags were never made current, so there's nothing to close
        if tag in unpaired:
            return
        self.depth -= 1
        if self.verbose:
            print(self.line_head, endlbl, tag)
        oldtag = self.current.tag
        if tag != oldtag:
            fmt = 'Tag mismatch: Got {}, expected {}'
            print(style('ERROR', RED), fmt.format(tag, oldtag))
            # Exit with a non-zero status: this is an error exit
            raise SystemExit(1)
        self.current = self.current.parent
        if self.current is self.rootnode:
            # There shouldn't be more data if we're back to the root
            # If we were passed a section of a document, we're at the
            # end of that section.
            raise StopIteration

    def handle_data(self, data):
        ''' Treat data like an unpaired tag, storing
            the data string in the node's children list
        '''
        # Ignore it if it's just whitespace
        if not data.strip():
            return
        if self.verbose:
            fmt = '{} {}\n{!r}'
            print(fmt.format(self.line_head, datalbl, data))
        parent = self.current
        node = Node(parent, 'data', {})
        parent.append(node)
        # Make the data string the node's child
        node.append(data)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
# Could be methods... | |
def find_by_class(node, nclass):
    ''' Search below `node` for a node whose class is exactly `nclass`

        The direct children of `node` are checked first; if none of them
        match, each non-data child is searched recursively, in order.
        Returns the first match found, or None if there is no match.
    '''
    for n in node:
        if n.nclass == nclass:
            return n
    for n in node:
        # The child of a data node is a string, not a node
        if n.tag == 'data':
            continue
        found = find_by_class(n, nclass)
        if found is not None:
            return found
    return None
def find_by_id(node, nid):
    ''' Search below `node` for a `div` whose id is `nid`

        The direct `div` children of `node` are checked first; if none of
        them match, each `div` child is searched recursively, in order.
        Only `div` nodes are descended into, so a `div` nested inside some
        other tag is not found. Returns the first match, or None.
    '''
    for n in node.iter_tag('div'):
        if n.nid == nid:
            return n
    for n in node.iter_tag('div'):
        found = find_by_id(n, nid)
        if found is not None:
            return found
    return None
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
def assemble(node):
    ''' Recursively assemble the contents of the content div ;)

        Each level of recursion yields a series of strings, which gets joined
        into a single string by the next level up, and also by the original
        caller, `show_content`

        Links, code, images, bold, italic and strike-through are rendered
        with Markdown-like markers and / or ANSI colors; any other tag is
        replaced by its assembled contents.
    '''
    for n in node:
        tag = n.tag
        if tag == 'data':
            # The only child of a data node is its text
            yield n[0].strip()
        elif tag == 'a':
            # An anchor need not have an href (or it may be valueless)
            href = n.attrs.get('href') or ''
            text = ' '.join(assemble(n))
            yield '[{}]({})'.format(style(text, BLUE), href)
        elif tag == 'code':
            # Joined without spaces, unlike the other tags
            text = ''.join(assemble(n))
            yield style(text, GREEN)
        elif tag == 'img':
            yield style(n.attrs.get('src') or '', BLUE)
        elif tag == 'b':
            yield '**' + ' '.join(assemble(n)) + '**'
        elif tag == 'i':
            yield '_' + ' '.join(assemble(n)) + '_'
        elif tag == 'strike':
            yield '---' + ' '.join(assemble(n)) + '---'
        elif tag == 'br':
            yield '\n'
        else:
            yield ' '.join(assemble(n))
def show_content(node):
    ''' Display the content div

        Prints the strings yielded by `assemble(node)`, joined by spaces.
    '''
    print(' '.join(assemble(node)))
# Beware! `show_message` and `show_monologue` may re-use a name such as
# `node` as the loop variable of a `for` loop over that very node. This is
# safe, because the iterator is created before the name is rebound.
def show_message(node):
    ''' Display the message div

        Prints the message id (in magenta if the message is highlighted),
        then, for each child, the id of the message being replied to
        (class "reply-info") or the message content (class "content").
    '''
    msg_id = bold('Message id ') + node.nid
    if node.nclass == "message highlight":
        msg_id = style(msg_id, MAGENTA)
    print(msg_id)
    for child in node:
        if child.nclass == 'reply-info':
            # The reply id is the fragment of the link, i.e. the part after
            # the '#'. A missing href or fragment gives an empty id.
            href = child.attrs.get('href') or ''
            reply_id = href.partition('#')[2]
            print(bold('Reply to'), reply_id)
        elif child.nclass == 'content':
            show_content(child)
def show_monologue(node):
    ''' Display the monologue div

        Prints the user id, taken from the part of the div's class after the
        first '-', then the user name from the "signature" div, and the
        timestamps and messages from the "messages" div, followed by
        a red separator line.
    '''
    # partition gives '' rather than an error if there is no '-'
    user_id = node.nclass.partition('-')[2]
    print(bold('User id'), user_id, end=' ')
    for section in node.iter_tag('div'):
        if section.nclass == 'signature':
            name_node = find_by_class(section, 'username')
            if name_node is not None:
                # NOTE(review): assumes the name is nested as
                # username -> tag -> data -> text; confirm against
                # signatures where the name is not wrapped in a tag.
                username = name_node[0][0][0]
                print(bold('User name'), username, end=' ')
        elif section.nclass == 'messages':
            for item in section.iter_tag('div'):
                if item.nclass == 'timestamp':
                    # timestamp -> data -> text
                    print(bold('Timestamp'), item[0][0], end=' ')
                elif item.nclass.startswith('message'):
                    show_message(item)
    print(REDLINE)
def show_transcript(parser, verbose=False):
    ''' Show the full transcript

        `parser` is a ParseToTree that has already been fed the document.
        Prints the page title (if there is one), then every monologue in
        the div whose id is "transcript". If `verbose` is true, the node
        tree of each monologue is printed before the monologue itself.
        Prints an error message and returns if the document has no
        `html` node, or no transcript div.
    '''
    # Look the nodes up by tag rather than by position, so that extra
    # top-level nodes or extra children of `html` don't break things.
    html = next(parser.rootnode.iter_tag('html'), None)
    if html is None:
        print(style('ERROR', RED), 'No html node found')
        return
    head = next(html.iter_tag('head'), None)
    body = next(html.iter_tag('body'), None)

    title = None if head is None else next(head.iter_tag('title'), None)
    # An empty title tag has no data child
    if title is not None and title.children:
        # title -> data -> text
        print(bold('Title'), title[0][0])

    transcript = None if body is None else find_by_id(body, 'transcript')
    if transcript is None:
        print(style('ERROR', RED), 'No transcript found')
        return
    for node in transcript.iter_tag('div'):
        if node.nclass.startswith('monologue user'):
            if verbose:
                node.show()
                print()
            show_monologue(node)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
def main():
    ''' Fetch the URL given on the command line & show its transcript

        Prints a usage message if there isn't exactly one argument, and
        an error message if the URL can't be opened or decoded.
    '''
    verbose = False
    if len(sys.argv) != 2:
        usage = ('Show SO chat transcript\nUsage:\n%s URL\n'
            'URL can be a local file, prefixed with "file:"' % sys.argv[0])
        print(usage)
        return

    url = sys.argv[1]
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read().decode('utf-8')
    except (URLError, ValueError) as err:
        # urlopen raises ValueError, not URLError, for a URL without a
        # scheme (e.g. a bare file name). A decoding failure is also a
        # ValueError (UnicodeDecodeError is a subclass).
        print(style('ERROR', RED), err, '\n')
        return

    parser = ParseToTree(verbose=verbose)
    try:
        parser.feed(data)
        parser.close()
    except StopIteration:
        # Raised by the parser when the top-level node has been closed
        pass
    if verbose:
        print(REDLINE)
    show_transcript(parser, verbose=verbose)
# Only run when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment