PM2Ring · June 20, 2017 14:13
diff --git a/parse_SO_chat.py b/parse_SO_chat.py
 #!/usr/bin/env python3

 ''' "Simple" parser to extract & print the messages from 
    an SO chat room transcript file or URL

    Work in progress, but usable. I think. :)

    Written by PM 2Ring 2017.06.20
 '''

 import sys
 from html.parser import HTMLParser
 import urllib.request
 from urllib.error import URLError

 # ANSI style & color numbers, used as the `mode` argument of `style`
 # Text attributes
 BOLD, ULINE = 1, 4

 # Colors
 RED, GREEN, YELLOW, BLUE = 31, 32, 33, 34
 MAGENTA, CYAN, GREY = 35, 36, 37

 def style(s, mode):
    ''' Wrap `s` in the escape sequence that selects `mode`,
        followed by the sequence that resets all styling
    '''
    styled = f'\x1b[{mode}m{s}\x1b[0m'
    return styled

 def bold(s):
    ''' Return `s` styled as bold text '''
    return style(s, mode=BOLD)

 # A dashed red rule; printed after each monologue and in verbose output
 REDLINE = style('- ' * 32, RED)

 # HTML void elements: tags that do not have end tags. The parser adds them
 # to the tree but never makes them the current open node, so any void
 # element missing from this set would cause a spurious tag mismatch when
 # its parent is closed.
 unpaired = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
 }

 # Bold labels for the parser's verbose output
 startlbl, endlbl, datalbl = map(bold, ('START', 'END', 'DATA'))

 class Node:
    ''' A directed tree node for HTML tags.

        Every node records its parent, its tag name, its attribute dict
        and an ordered list of children. The root of the tree is a fake
        tag named "root". A data element is also stored as a node (tag
        "data"), with its text string as its only child.
    '''
    def __init__(self, parent, tag, attrs):
        self.parent, self.tag, self.attrs = parent, tag, attrs
        self.children = []
        # Keep the id and class handy; both default to the empty string
        self.nid = attrs.get('id', '')
        self.nclass = attrs.get('class', '')

    def __repr__(self):
        return f'{self.tag}{self.attrs} {len(self.children)}'

    def __getitem__(self, key):
        ''' Index (or slice) the list of children '''
        return self.children[key]

    def __iter__(self):
        ''' Iterate over the direct children '''
        return iter(self.children)

    def iter_tag(self, tag):
        ''' An iterator over all child nodes that match `tag` '''
        for child in self.children:
            if child.tag != tag:
                continue
            yield child

    def append(self, node):
        ''' Add `node` as the last child '''
        self.children.append(node)

    def show(self, depth=0):
        ''' Depth-first traversal to print a node & its children '''
        print(depth, '  '*depth, self)
        # The child of a data node is a plain string, which can't be shown
        if self.tag != 'data':
            for child in self.children:
                child.show(depth + 1)

 class ParseToTree(HTMLParser):
    ''' Parse some HTML into a tree of Nodes.

        The tree hangs off `self.rootnode`, a fake "root" node, and
        `self.current` is the innermost element that is still open.
        You can feed it a partial document and it will
        raise StopIteration when the top node is closed,
        ignoring any subsequent data.
    '''
    def __init__(self, verbose=False):
        super().__init__()
        self.verbose = verbose
        # Nesting level of open paired tags; only used to indent verbose output
        self.depth = 0
        self.rootnode = Node(None, 'root', {})
        self.current = self.rootnode

    @property
    def line_head(self):
        ''' Line number and indentation '''
        return str(self.getpos()[0]).zfill(4) + '  ' * self.depth

    def handle_starttag(self, tag, attrs):
        ''' Add a node for `tag` under the current node. A paired tag
            becomes the new current node; an unpaired tag never does.
        '''
        attrs = dict(attrs)
        if self.verbose:
            print(self.line_head, startlbl, tag, attrs)

        parent = self.current
        node = Node(parent, tag, attrs)
        parent.append(node)
        if tag not in unpaired:
            self.depth += 1
            self.current = node

    def handle_endtag(self, tag):
        ''' Close the current node and step back up to its parent.

            Raises SystemExit (status 1) if `tag` doesn't match the
            currently open tag, and StopIteration once the top-level
            element has been closed.
        '''
        if tag in unpaired:
            return

        self.depth -= 1
        if self.verbose:
            print(self.line_head, endlbl, tag)

        oldtag = self.current.tag
        if tag != oldtag:
            fmt = 'Tag mismatch: Got {}, expected {}'
            print(style('ERROR', RED), fmt.format(tag, oldtag))
            # A bare SystemExit would exit with status 0, i.e. success
            raise SystemExit(1)

        self.current = self.current.parent
        if self.current is self.rootnode:
            # There shouldn't be more data if we're back to the root
            # If we were passed a section of a document, we're at the
            # end of that section.
            raise StopIteration

    def handle_data(self, data):
        ''' Treat data like an unpaired tag, storing
            the data string in the node's children list
        '''
        # Ignore it if it's just whitespace
        if not data.strip():
            return
        if self.verbose:
            fmt = '{} {}\n{!r}'
            print(fmt.format(self.line_head, datalbl, data))
        parent = self.current
        node = Node(parent, 'data', {})
        parent.append(node)
        # Make the data string the node's child
        node.append(data)

 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

 # Could be methods...
 def find_by_class(node, nclass):
    ''' Search below `node` for a node whose class is exactly `nclass`.

        All the direct children are checked first; only when none of them
        matches is the subtree of each child searched in turn.
        Returns None if nothing matches.
    '''
    direct = next((kid for kid in node if kid.nclass == nclass), None)
    if direct is not None:
        return direct
    for kid in node:
        # The child of a data node is a plain string, so don't descend
        if kid.tag != 'data':
            found = find_by_class(kid, nclass)
            if found:
                return found
    return None

 def find_by_id(node, nid):
    ''' Search below `node` for a `div` whose id is `nid`.

        Only `div` children are examined or descended into. All the direct
        `div` children are checked first; only when none of them matches is
        the subtree of each one searched in turn.
        Returns None if nothing matches.
    '''
    direct = next((div for div in node.iter_tag('div') if div.nid == nid), None)
    if direct is not None:
        return direct
    for div in node.iter_tag('div'):
        found = find_by_id(div, nid)
        if found:
            return found
    return None

 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

 def assemble(node):
    ''' Recursively assemble the contents of the content div ;)

        Yields one string per child of `node`. The strings produced for a
        child's own children are joined into a single string at this level;
        the top-level caller, `show_content`, does the final join.
    '''
    # Tags rendered by putting the same marker on both sides of their text
    wrappers = {'b': '**', 'i': '_', 'strike': '---'}

    for child in node:
        kind = child.tag
        if kind == 'data':
            yield child[0].strip()
        elif kind == 'br':
            yield '\n'
        elif kind == 'img':
            yield style(child.attrs['src'], BLUE)
        elif kind == 'code':
            # Code fragments are joined without separators
            yield style(''.join(assemble(child)), GREEN)
        elif kind == 'a':
            href = child.attrs['href']
            label = ' '.join(assemble(child))
            yield '[{}]({})'.format(style(label, BLUE), href)
        elif kind in wrappers:
            mark = wrappers[kind]
            yield mark + ' '.join(assemble(child)) + mark
        else:
            # Any other tag just contributes its assembled contents
            yield ' '.join(assemble(child))

 def show_content(node):
    ''' Display the content div: print its assembled pieces, space-separated '''
    pieces = assemble(node)
    print(' '.join(pieces))

 # Beware! `show_monologue` re-uses the `node` name in its nested `for`
 # loops. This is safe, because the name gets reset at the top of each loop.

 def show_message(node):
    ''' Display the message div: its id (in magenta when the message is
        highlighted), the id of the message it replies to, if any, and
        its content
    '''
    label = bold('Message id ') + node.nid
    if node.nclass == "message highlight":
        label = style(label, MAGENTA)
    print(label)

    for part in node:
        kind = part.nclass
        if kind == 'content':
            show_content(part)
        elif kind == 'reply-info':
            # The replied-to message id is the fragment of the link
            reply_id = part.attrs['href'].split('#')[1]
            print(bold('Reply to'), reply_id)

 def show_monologue(node):
    ''' Display the monologue div: the user id taken from its class, the
        user name from its signature, then each timestamp and message,
        followed by a red separator line
    '''
    # The user id is whatever follows the first '-' in the class string
    print(bold('User id'), node.nclass.split('-', 1)[1], end=' ')

    for node in node.iter_tag('div'):
        section = node.nclass
        if section == 'signature':
            name_node = find_by_class(node, 'username')
            # Drill down through two levels of children to the text string
            print(bold('User name'), name_node[0][0][0], end=' ')
        elif section == 'messages':
            for node in node.iter_tag('div'):
                if node.nclass == 'timestamp':
                    print(bold('Timestamp'), node[0][0], end=' ')
                elif node.nclass.startswith('message'):
                    show_message(node)
    print(REDLINE)

 def show_transcript(parser, verbose=False):
    ''' Show the full transcript

        `parser` is a ParseToTree that has already been fed the document.
        Prints the document title, if it has one, then every monologue
        found in the div whose id is `transcript`. When `verbose` is true
        the node tree of each monologue is dumped before it is displayed.
        Prints an error message and returns if there is no transcript div.
    '''
    html = parser.rootnode[0]
    head, body = html.children
    # A missing or empty <title> shouldn't stop us showing the messages
    title = next(head.iter_tag('title'), None)
    if title is not None and title.children:
        print(bold('Title'), title[0][0])

    transcript = find_by_id(body, 'transcript')
    if transcript is None:
        # E.g. the URL was some other kind of page
        print(style('ERROR', RED), 'No transcript found in document')
        return

    for node in transcript.iter_tag('div'):
        if node.nclass.startswith('monologue user'):
            if verbose:
                node.show()
                print()
            show_monologue(node)

 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

 def main():
    ''' Fetch the URL named on the command line, parse it, and show the
        transcript. Prints a usage message if there isn't exactly one
        argument, and an error message if the document can't be read.
    '''
    verbose = False
    if len(sys.argv) != 2:
        usage = ('Show SO chat transcript\nUsage:\n%s URL\n'
            'URL can be a local file, prefixed with "file:"' % sys.argv[0])
        print(usage)
        return

    url = sys.argv[1]
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read().decode('utf-8')
    except (URLError, ValueError) as err:
        # urlopen raises ValueError, not URLError, for an argument with no
        # scheme, e.g. a bare filename. Data that isn't valid UTF-8 also
        # lands here, because UnicodeDecodeError is a ValueError.
        print(style('ERROR', RED), err, '\n')
        return

    parser = ParseToTree(verbose=verbose)
    try:
        parser.feed(data)
        parser.close()
    except StopIteration:
        # Raised by the parser when the top-level element is closed
        pass

    if verbose:
        print(REDLINE)
    show_transcript(parser, verbose=verbose)

 if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	''' "Simple" parser to extract & print the messages from
	an SO chat room transcript file or URL

	Work in progress, but usable. I think. :)

	Written by PM 2Ring 2017.06.20
	'''

	import sys
	from html.parser import HTMLParser
	import urllib.request
	from urllib.error import URLError

	# ANSI style & color numbers, used as the `mode` argument of `style`
	# Text attributes
	BOLD, ULINE = 1, 4

	# Colors
	RED, GREEN, YELLOW, BLUE = 31, 32, 33, 34
	MAGENTA, CYAN, GREY = 35, 36, 37

	def style(s, mode):
	    ''' Wrap `s` in the escape sequence that selects `mode`,
	        followed by the sequence that resets all styling
	    '''
	    return f'\x1b[{mode}m{s}\x1b[0m'

	def bold(s):
	    ''' Return `s` styled as bold text '''
	    return style(s, BOLD)

	# A dashed red rule; printed after each monologue and in verbose output
	REDLINE = style('- ' * 32, RED)

	# HTML void elements: tags that do not have end tags. The parser adds them
	# to the tree but never makes them the current open node, so any void
	# element missing from this set would cause a spurious tag mismatch when
	# its parent is closed.
	unpaired = {
	    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	    'link', 'meta', 'param', 'source', 'track', 'wbr',
	}

	# Bold labels for the parser's verbose output
	startlbl, endlbl, datalbl = map(bold, ('START', 'END', 'DATA'))

	class Node:
	    ''' A directed tree node for HTML tags.

	        Every node records its parent, its tag name, its attribute dict
	        and an ordered list of children. The root of the tree is a fake
	        tag named "root". A data element is also stored as a node (tag
	        "data"), with its text string as its only child.
	    '''
	    def __init__(self, parent, tag, attrs):
	        self.parent = parent
	        self.tag = tag
	        self.attrs = attrs
	        self.children = []
	        # Keep the id and class handy; both default to the empty string
	        self.nid = self.attrs.get('id', '')
	        self.nclass = self.attrs.get('class', '')

	    def __repr__(self):
	        return f'{self.tag}{self.attrs} {len(self.children)}'

	    def __getitem__(self, key):
	        ''' Index (or slice) the list of children '''
	        return self.children[key]

	    def __iter__(self):
	        ''' Iterate over the direct children '''
	        return iter(self.children)

	    def iter_tag(self, tag):
	        ''' An iterator over all child nodes that match `tag` '''
	        for n in self:
	            if n.tag == tag:
	                yield n

	    def append(self, node):
	        ''' Add `node` as the last child '''
	        self.children.append(node)

	    def show(self, depth=0):
	        ''' Depth-first traversal to print a node & its children '''
	        # Two spaces of indentation per level
	        print(depth, '  '*depth, self)
	        if self.tag == 'data':
	            # The child of a data node is a plain string, which can't be shown
	            return
	        depth += 1
	        for n in self:
	            n.show(depth)

	class ParseToTree(HTMLParser):
	    ''' Parse some HTML into a tree of Nodes.

	        The tree hangs off `self.rootnode`, a fake "root" node, and
	        `self.current` is the innermost element that is still open.
	        You can feed it a partial document and it will
	        raise StopIteration when the top node is closed,
	        ignoring any subsequent data.
	    '''
	    def __init__(self, verbose=False):
	        super().__init__()
	        self.verbose = verbose
	        # Nesting level of open paired tags; only used to indent verbose output
	        self.depth = 0
	        self.rootnode = Node(None, 'root', {})
	        self.current = self.rootnode

	    @property
	    def line_head(self):
	        ''' Line number and indentation '''
	        return str(self.getpos()[0]).zfill(4) + '  ' * self.depth

	    def handle_starttag(self, tag, attrs):
	        ''' Add a node for `tag` under the current node. A paired tag
	            becomes the new current node; an unpaired tag never does.
	        '''
	        attrs = dict(attrs)
	        if self.verbose:
	            print(self.line_head, startlbl, tag, attrs)
	        if tag not in unpaired:
	            self.depth += 1

	        parent = self.current
	        node = Node(parent, tag, attrs)
	        parent.append(node)
	        if tag not in unpaired:
	            self.current = node

	    def handle_endtag(self, tag):
	        ''' Close the current node and step back up to its parent.

	            Raises SystemExit (status 1) if `tag` doesn't match the
	            currently open tag, and StopIteration once the top-level
	            element has been closed.
	        '''
	        if tag in unpaired:
	            return

	        self.depth -= 1
	        if self.verbose:
	            print(self.line_head, endlbl, tag)

	        oldtag = self.current.tag
	        if tag != oldtag:
	            fmt = 'Tag mismatch: Got {}, expected {}'
	            print(style('ERROR', RED), fmt.format(tag, oldtag))
	            # A bare SystemExit would exit with status 0, i.e. success
	            raise SystemExit(1)

	        self.current = self.current.parent
	        if self.current is self.rootnode:
	            # There shouldn't be more data if we're back to the root
	            # If we were passed a section of a document, we're at the
	            # end of that section.
	            raise StopIteration

	    def handle_data(self, data):
	        ''' Treat data like an unpaired tag, storing
	            the data string in the node's children list
	        '''
	        # Ignore it if it's just whitespace
	        if not data.strip():
	            return
	        if self.verbose:
	            fmt = '{} {}\n{!r}'
	            print(fmt.format(self.line_head, datalbl, data))
	        parent = self.current
	        node = Node(parent, 'data', {})
	        parent.append(node)
	        # Make the data string the node's child
	        node.append(data)

	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

	# Could be methods...
	def find_by_class(node, nclass):
	    ''' Search below `node` for a node whose class is exactly `nclass`.

	        All the direct children are checked first; only when none of them
	        matches is the subtree of each child searched in turn.
	        Returns None if nothing matches.
	    '''
	    for n in node:
	        if n.nclass == nclass:
	            return n
	    for n in node:
	        # The child of a data node is a plain string, so don't descend
	        if n.tag == 'data':
	            continue
	        found = find_by_class(n, nclass)
	        if found:
	            return found
	    return None

	def find_by_id(node, nid):
	    ''' Search below `node` for a `div` whose id is `nid`.

	        Only `div` children are examined or descended into. All the direct
	        `div` children are checked first; only when none of them matches is
	        the subtree of each one searched in turn.
	        Returns None if nothing matches.
	    '''
	    for n in node.iter_tag('div'):
	        if n.nid == nid:
	            return n
	    for n in node.iter_tag('div'):
	        found = find_by_id(n, nid)
	        if found:
	            return found
	    return None

	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

	def assemble(node):
	    ''' Recursively assemble the contents of the content div ;)

	        Yields one string per child of `node`. The strings produced for a
	        child's own children are joined into a single string at this level;
	        the top-level caller, `show_content`, does the final join.
	    '''
	    for n in node:
	        tag = n.tag
	        if tag == 'data':
	            yield n[0].strip()

	        elif tag == 'a':
	            href = n.attrs['href']
	            text = ' '.join(assemble(n))
	            yield '[{}]({})'.format(style(text, BLUE), href)

	        elif tag == 'code':
	            # Code fragments are joined without separators
	            text = ''.join(assemble(n))
	            yield style(text, GREEN)

	        elif tag == 'img':
	            yield style(n.attrs['src'], BLUE)

	        elif tag == 'b':
	            # Bold text is wrapped in double asterisks
	            yield '**' + ' '.join(assemble(n)) + '**'
	        elif tag == 'i':
	            yield '_' + ' '.join(assemble(n)) + '_'
	        elif tag == 'strike':
	            yield '---' + ' '.join(assemble(n)) + '---'

	        elif tag == 'br':
	            yield '\n'
	        else:
	            # Any other tag just contributes its assembled contents
	            yield ' '.join(assemble(n))

	def show_content(node):
	    ''' Display the content div: print its assembled pieces, space-separated '''
	    print(' '.join(assemble(node)))

	# Beware! `show_monologue` re-uses the `node` name in its nested `for`
	# loops. This is safe, because the name gets reset at the top of each loop.

	def show_message(node):
	    ''' Display the message div: its id (in magenta when the message is
	        highlighted), the id of the message it replies to, if any, and
	        its content
	    '''
	    msg_id = bold('Message id ') + node.nid
	    if node.nclass == "message highlight":
	        msg_id = style(msg_id, MAGENTA)
	    print(msg_id)

	    for part in node:
	        if part.nclass == 'reply-info':
	            # The replied-to message id is the fragment of the link
	            reply_id = part.attrs['href'].split('#')[1]
	            print(bold('Reply to'), reply_id)
	        elif part.nclass == 'content':
	            show_content(part)

	def show_monologue(node):
	    ''' Display the monologue div: the user id taken from its class, the
	        user name from its signature, then each timestamp and message,
	        followed by a red separator line
	    '''
	    # The user id is whatever follows the first '-' in the class string
	    user_id = node.nclass.split('-', 1)[1]
	    print(bold('User id'), user_id, end=' ')

	    for node in node.iter_tag('div'):
	        if node.nclass == 'signature':
	            # Drill down through two levels of children to the text string
	            username = find_by_class(node, 'username')[0][0][0]
	            print(bold('User name'), username, end=' ')
	        elif node.nclass == 'messages':
	            for node in node.iter_tag('div'):
	                if node.nclass == 'timestamp':
	                    print(bold('Timestamp'), node[0][0], end=' ')
	                elif node.nclass.startswith('message'):
	                    show_message(node)
	    print(REDLINE)

	def show_transcript(parser, verbose=False):
	    ''' Show the full transcript

	        `parser` is a ParseToTree that has already been fed the document.
	        Prints the document title, if it has one, then every monologue
	        found in the div whose id is `transcript`. When `verbose` is true
	        the node tree of each monologue is dumped before it is displayed.
	        Prints an error message and returns if there is no transcript div.
	    '''
	    html = parser.rootnode[0]
	    head, body = html.children
	    # A missing or empty <title> shouldn't stop us showing the messages
	    title = next(head.iter_tag('title'), None)
	    if title is not None and title.children:
	        print(bold('Title'), title[0][0])

	    transcript = find_by_id(body, 'transcript')
	    if transcript is None:
	        # E.g. the URL was some other kind of page
	        print(style('ERROR', RED), 'No transcript found in document')
	        return

	    for node in transcript.iter_tag('div'):
	        if node.nclass.startswith('monologue user'):
	            if verbose:
	                node.show()
	                print()
	            show_monologue(node)

	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

	def main():
	    ''' Fetch the URL named on the command line, parse it, and show the
	        transcript. Prints a usage message if there isn't exactly one
	        argument, and an error message if the document can't be read.
	    '''
	    verbose = False
	    if len(sys.argv) != 2:
	        usage = ('Show SO chat transcript\nUsage:\n%s URL\n'
	            'URL can be a local file, prefixed with "file:"' % sys.argv[0])
	        print(usage)
	        return

	    url = sys.argv[1]
	    try:
	        with urllib.request.urlopen(url) as response:
	            data = response.read().decode('utf-8')
	    except (URLError, ValueError) as err:
	        # urlopen raises ValueError, not URLError, for an argument with no
	        # scheme, e.g. a bare filename. Data that isn't valid UTF-8 also
	        # lands here, because UnicodeDecodeError is a ValueError.
	        print(style('ERROR', RED), err, '\n')
	        return

	    parser = ParseToTree(verbose=verbose)
	    try:
	        parser.feed(data)
	        parser.close()
	    except StopIteration:
	        # Raised by the parser when the top-level element is closed
	        pass

	    if verbose:
	        print(REDLINE)
	    show_transcript(parser, verbose=verbose)

	if __name__ == '__main__':
	    main()