Skip to content

Instantly share code, notes, and snippets.

@PM2Ring
Created June 20, 2017 14:13
Show Gist options
  • Save PM2Ring/37ba8f87a0e0b474dab4a5ccbf3b3772 to your computer and use it in GitHub Desktop.
Save PM2Ring/37ba8f87a0e0b474dab4a5ccbf3b3772 to your computer and use it in GitHub Desktop.
Display a Stack Overflow chat room transcript in the shell, with ANSI color codes. Work in progress.
#!/usr/bin/env python3
''' "Simple" parser to extract & print the messages from
an SO chat room transcript file or URL
Work in progress, but usable. I think. :)
Written by PM 2Ring 2017.06.20
'''
import sys
from html.parser import HTMLParser
import urllib.request
from urllib.error import URLError
# ANSI style & color numbers
# Each value is the number that `style()` places between '\x1b[' and 'm'.
# Text styles
BOLD = 1
ULINE = 4
# Colors
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35
CYAN = 36
GREY = 37
def style(s, mode):
    ''' Wrap `s` in the ANSI escape sequence selecting `mode`,
        followed by the sequence that resets all styling
    '''
    return '\x1b[{}m{}\x1b[0m'.format(mode, s)
def bold(s):
    ''' Return `s` wrapped in the ANSI bold style '''
    return style(s, mode=BOLD)
# Separator line: 32 repetitions of '- ', colored red
REDLINE = style('- ' * 32, RED)
# HTML void elements: tags that never have an end tag. The parser must
# not treat them as open containers, otherwise the next real end tag
# would be reported as a tag mismatch.
unpaired = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
}
# Bold labels used in the parser's verbose trace output
startlbl = bold('START')
endlbl = bold('END')
datalbl = bold('DATA')
class Node:
    ''' A directed tree node for HTML tags

        The root of the tree is a fake tag named "root". A data element is
        also stored as a node, using the tag name "data", with its text
        string as its only child.
    '''
    def __init__(self, parent, tag, attrs):
        self.parent, self.tag, self.attrs = parent, tag, attrs
        self.children = []
        # Keep the id and class handy; both default to the empty string
        self.nid = attrs.get('id', '')
        self.nclass = attrs.get('class', '')

    def __repr__(self):
        return '{}{} {}'.format(self.tag, self.attrs, len(self.children))

    def __getitem__(self, key):
        ''' Index (or slice) into the list of children '''
        return self.children[key]

    def __iter__(self):
        ''' Iterate over the direct children '''
        return iter(self.children)

    def iter_tag(self, tag):
        ''' An iterator over all direct children whose tag is `tag` '''
        yield from (child for child in self if child.tag == tag)

    def append(self, node):
        ''' Add `node` as the last child '''
        self.children.append(node)

    def show(self, depth=0):
        ''' Depth-first traversal to print a node & its children.
            Each line shows the depth, an indent of `depth` spaces
            and the node itself.
        '''
        print(depth, ' '*depth, self)
        # The only child of a data node is a plain string, not a Node
        if self.tag != 'data':
            for child in self:
                child.show(depth + 1)
class ParseToTree(HTMLParser):
    ''' Parse some HTML into a tree of Nodes.

        The tree hangs off `self.rootnode`, a fake tag named "root".
        You can feed it a partial document and it will
        raise StopIteration when the top node is closed,
        ignoring any subsequent data. Callers should therefore wrap
        `feed()` / `close()` in `try: ... except StopIteration`.

        If an end tag does not match the innermost open tag, an error
        message is printed and SystemExit is raised with exit status 1.

        verbose: if true, print a trace line for every start tag,
            end tag and data string as it is handled.
    '''
    def __init__(self, verbose=False):
        super().__init__()
        self.verbose = verbose
        # Nesting level of the innermost open tag; only used to indent
        # the verbose trace
        self.depth = 0
        self.rootnode = Node(None, 'root', {})
        # The innermost open tag. New nodes are attached to it.
        self.current = self.rootnode

    @property
    def line_head(self):
        ''' Line number and indentation '''
        return str(self.getpos()[0]).zfill(4) + ' ' * self.depth

    def handle_starttag(self, tag, attrs):
        ''' Attach a new Node for `tag` to the current node. Unless the
            tag is in `unpaired`, it becomes the new current node.
        '''
        attrs = dict(attrs)
        if self.verbose:
            print(self.line_head, startlbl, tag, attrs)
        node = Node(self.current, tag, attrs)
        self.current.append(node)
        if tag not in unpaired:
            self.depth += 1
            self.current = node

    def handle_endtag(self, tag):
        ''' Close the current node and step back up to its parent.

            End tags of `unpaired` tags are ignored, since those tags
            never became the current node.
            Raises SystemExit(1) on a tag mismatch, and StopIteration
            once the top-level node has been closed.
        '''
        if tag in unpaired:
            return
        self.depth -= 1
        if self.verbose:
            print(self.line_head, endlbl, tag)
        expected = self.current.tag
        if tag != expected:
            fmt = 'Tag mismatch: Got {}, expected {}'
            print(style('ERROR', RED), fmt.format(tag, expected))
            # A bare SystemExit would report success (status 0) to the shell
            raise SystemExit(1)
        self.current = self.current.parent
        if self.current is self.rootnode:
            # There shouldn't be more data if we're back to the root
            # If we were passed a section of a document, we're at the
            # end of that section.
            raise StopIteration

    def handle_data(self, data):
        ''' Treat data like an unpaired tag, storing
            the data string in the node's children list
        '''
        # Ignore it if it's just whitespace
        if not data.strip():
            return
        if self.verbose:
            fmt = '{} {}\n{!r}'
            print(fmt.format(self.line_head, datalbl, data))
        node = Node(self.current, 'data', {})
        self.current.append(node)
        # Make the data string the node's child
        node.append(data)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Could be methods...
def find_by_class(node, nclass):
    ''' Do a breadth-first search for a descendant of `node` whose class
        is exactly `nclass`.

        The tree is searched one level at a time, so the shallowest match
        wins; among matches at the same depth, the first one in document
        order wins. "data" nodes are tested like any other node but are
        never descended into, because their only child is a plain string.

        Returns the matching node, or None if there is no match.
    '''
    level = [node]
    while level:
        next_level = []
        for parent in level:
            for child in parent:
                if child.nclass == nclass:
                    return child
                if child.tag != 'data':
                    next_level.append(child)
        level = next_level
    return None
def find_by_id(node, nid):
    ''' Do a breadth-first search for a `div` descendant of `node` whose
        id is `nid`.

        Only `div` nodes are tested and descended into, so a div nested
        inside some other kind of tag is not found. The tree is searched
        one level at a time, so the shallowest match wins.

        Returns the matching node, or None if there is no match.
    '''
    level = [node]
    while level:
        next_level = []
        for parent in level:
            for div in parent.iter_tag('div'):
                if div.nid == nid:
                    return div
                next_level.append(div)
        level = next_level
    return None
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def assemble(node):
    ''' Recursively assemble the contents of the content div ;)

        Each level of recursion yields a series of strings, which gets joined
        into a single string by the next level up, and also by the original
        caller, `show_content`

        Rendering rules for the children of `node`:
        data    the text, stripped of surrounding whitespace
        a       "[text](href)" with the text in blue; just the plain text
                if the tag has no href
        code    the contents joined without spaces, in green
        img     the src in blue; nothing if the tag has no src
        b, i, strike    the contents wrapped in "**", "_" or "---"
        br      a newline
        Any other tag yields its contents joined with spaces.
    '''
    # Plain-text markers wrapped around simple formatting tags
    wrappers = {'b': '**', 'i': '_', 'strike': '---'}
    for n in node:
        tag = n.tag
        if tag == 'data':
            yield n[0].strip()
        elif tag == 'a':
            text = ' '.join(assemble(n))
            href = n.attrs.get('href')
            if href is None:
                # An anchor without a target isn't a link: show only its text
                yield text
            else:
                yield '[{}]({})'.format(style(text, BLUE), href)
        elif tag == 'code':
            text = ''.join(assemble(n))
            yield style(text, GREEN)
        elif tag == 'img':
            src = n.attrs.get('src')
            if src is not None:
                yield style(src, BLUE)
        elif tag in wrappers:
            mark = wrappers[tag]
            yield mark + ' '.join(assemble(n)) + mark
        elif tag == 'br':
            yield '\n'
        else:
            yield ' '.join(assemble(n))
def show_content(node):
    ''' Print the assembled text of the content div,
        with its pieces separated by single spaces
    '''
    pieces = assemble(node)
    print(' '.join(pieces))
# Beware! `show_monologue` re-uses the `node` name in its nested `for`
# loops. This is safe, because the name gets reset at the top of each loop.
def show_message(node):
    ''' Display the message div: a header line with the message id
        (in magenta for a highlighted message), followed by the reply
        target and the content, for whichever of those children exist
    '''
    header = bold('Message id ') + node.nid
    highlighted = node.nclass == "message highlight"
    print(style(header, MAGENTA) if highlighted else header)
    for child in node:
        kind = child.nclass
        if kind == 'content':
            show_content(child)
        elif kind == 'reply-info':
            # The id of the message being replied to follows the '#'
            reply_id = child.attrs['href'].split('#')[1]
            print(bold('Reply to'), reply_id)
def _first_text(node):
    ''' Follow the chain of first children down from `node` and return
        the first text string reached, or '' if the chain ends without one
    '''
    while not isinstance(node, str):
        try:
            node = node[0]
        except IndexError:
            return ''
    return node

def show_monologue(node):
    ''' Display the monologue div

        Prints the user id (the part of the div's class after the first
        '-'), the user name from the signature div, and then the timestamp
        and each message from the messages div, followed by a red
        separator line.
    '''
    user_id = node.nclass.split('-', 1)[1]
    print(bold('User id'), user_id, end=' ')
    for node in node.iter_tag('div'):
        if node.nclass == 'signature':
            # The name may or may not be wrapped in a link, and the
            # username element may be missing entirely
            name_node = find_by_class(node, 'username')
            username = '' if name_node is None else _first_text(name_node)
            print(bold('User name'), username, end=' ')
        elif node.nclass == 'messages':
            for node in node.iter_tag('div'):
                if node.nclass == 'timestamp':
                    print(bold('Timestamp'), _first_text(node), end=' ')
                elif node.nclass.startswith('message'):
                    show_message(node)
    print(REDLINE)
def show_transcript(parser, verbose=False):
    ''' Show the full transcript

        parser: a ParseToTree that has already been fed the page. The
            first child of its root node must have exactly two children,
            which are taken to be the head and the body.
        verbose: if true, dump the node tree of each monologue before
            displaying it.
    '''
    head, body = parser.rootnode[0].children
    title_node = next(head.iter_tag('title'))
    print(bold('Title'), title_node[0][0])
    transcript = find_by_id(body, 'transcript')
    for div in transcript.iter_tag('div'):
        if not div.nclass.startswith('monologue user'):
            continue
        if verbose:
            div.show()
            print()
        show_monologue(div)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def main():
    ''' Fetch the transcript named on the command line and display it

        Expects exactly one command line argument, the URL of the
        transcript; otherwise a usage message is printed. If the page
        cannot be fetched or decoded, an error message is printed and
        nothing else is done.
    '''
    verbose = False
    if len(sys.argv) != 2:
        usage = ('Show SO chat transcript\nUsage:\n%s URL\n'
            'URL can be a local file, prefixed with "file:"' % sys.argv[0])
        print(usage)
        return
    url = sys.argv[1]
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read().decode('utf-8')
    except (URLError, ValueError) as err:
        # URLError: the fetch failed. ValueError: urlopen rejects a URL
        # with no scheme (e.g. a bare file name); it also covers
        # UnicodeDecodeError, for a page that isn't valid UTF-8.
        print(style('ERROR', RED), err, '\n')
        return
    parser = ParseToTree(verbose=verbose)
    try:
        parser.feed(data)
        parser.close()
    except StopIteration:
        # Raised by the parser once the top-level tag has been closed
        pass
    if verbose:
        print(REDLINE)
    show_transcript(parser, verbose=verbose)
# Only run when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment