kbauer · November 20, 2017 13:38
diff --git a/html2bbcode.py b/html2bbcode.py
 #!/usr/bin/env python3

 #### USAGE
 ##
 ##     python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
 ## 
 ## Output is written to stdout.
 ##
 ## INPUTFILE must be an HTML file. XML should work too, but will likely
 ## require redefining all replacement rules.
 ## 
 ## EXTRARULE are additional replacement rules in the format of the
 ## variable TAGRULES, which can also redefine predefined rules. Each
 ## rule is a separate shell argument and must be passed as a JSON
 ## string, e.g.
 ## 
 ##     python3 <scriptname> myfile.html \
 ##         '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
 ##         > myfile.bbcode
 ## 
 ## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
 ## implementations of BBCODE sadly use the more verbose
 ## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; For these you need to
 ## specify a corresponding EXTRARULE.
 ## 
 ## When tags are encountered for which no rule is defined, they will
 ## be omitted and reported on stderr *after* the bbcode output; The
 ## ordering of "stdout before stderr" is enforced by flushing the
 ## streams.
 ## 
 ## Useful in combination with a markdown converter, e.g. 
 ## 
 ##     pandoc --from=markdown --to=html myfile.md myfile.html
 ##     python3 <scriptname> myfile.html > myfile.bbcode


 #### TAGRULES. 
 ## 
 ## Format:
 ## 
 ## TAGRULE = [['tag1','tag2', ...], RULE]
 ## RULE, either:
 ##    'ignore'
 ##    [BEFORE_TEXT,AFTER_TEXT]
 ## 
 ## In the strings inside RULE, attributes of the html tag are
 ## accessible as {NAME}, see e.g. the tag rule for 'a'.
 TAGRULES = [
    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
    [['p','div'], ['','\n']],
    [['p','div'], ['','\n']],
    [['head'], 'ignore'],
    [['html', 'body'], ['','']],
    [['strong','b'], ['[B]','[/B]']],
    [['em','i'], ['[I]','[/I]']],
    [['pre'], ['[CODE]\n','\n[/CODE]']],
    [['code'], ['[font=Courier New]','[/font]']],
    [['a'], ['[URL={href}]','[/URL]']],
    [['u'], ['[U]','[/U]']],
    [['ul'], ['[LIST]','[/LIST]']],
    [['li'], ['[*]', '']],
 ]

 #### CODE 
 from lxml import html
 from lxml import etree
 from collections import namedtuple
 import sys
 import json


 def main(inpath, tagrules_override):
    '''
    Convert the HTML file `inpath` to BBCode and write it to stdout.

    The document is parsed with lxml.html and walked depth-first. For
    every element the BEFORE_TEXT of its rule is emitted, then the
    element's text, then its children (each followed by its tail
    text), then the AFTER_TEXT. Elements whose rule is 'ignore' are
    dropped together with their content. Tags without any rule are
    passed through without markup and listed on stderr after the
    BBCode has been written and flushed.

    Parameters:
        inpath: handed to lxml.html.parse.
        tagrules_override: iterable of TAGRULE entries in the format
            of TAGRULES. They are applied after TAGRULES, so they
            replace predefined rules for the same tag.

    Raises:
        Exception('Invalid rule', rule) if a rule is neither 'ignore'
        nor a list/tuple of exactly two elements.
    '''
    tree = html.parse(inpath)
    root = tree.getroot()

    unknowntags = []
    out = []

    ## Normalize tagrules into {tag: rule}; later entries win.
    tagrules = dict()
    for taglist, rule in TAGRULES + list(tagrules_override):
        for tag in taglist:
            tagrules[tag] = rule

    class _Attributes(dict):
        ## Attribute lookup for the rule templates. An attribute the
        ## element does not carry expands to '' instead of raising
        ## KeyError (e.g. an <a> without href and '[URL={href}]').
        def __missing__(self, name):
            return ''

    def _recur(node):
        tag = node.tag
        if not isinstance(tag, str):
            ## Comments, processing instructions and entities have a
            ## non-string tag in lxml; they carry no BBCode content.
            return

        if tag not in tagrules:
            ## Remember the fallback so each unknown tag is reported
            ## only once.
            tagrules[tag] = ('', '')
            unknowntags.append(tag)

        rule = tagrules[tag]
        if rule == 'ignore':
            return
        ## Checked after 'ignore' and by type, so that an arbitrary
        ## two-character string is not mistaken for a (before, after)
        ## pair.
        if not (isinstance(rule, (list, tuple)) and len(rule) == 2):
            raise Exception('Invalid rule', rule)

        before, after = rule
        attributes = _Attributes(node.attrib)
        out.append(before.format_map(attributes))
        if node.text:
            out.append(node.text)
        for child in node:  # iterating an element yields its children
            _recur(child)
            if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
                out.append(child.tail)
        out.append(after.format_map(attributes))

    ## NOTE(review): getroot() presumably returns None when the parser
    ## found no root element; in that case only a newline is printed.
    if root is not None:
        _recur(root)
    outstring = ''.join(out).strip()

    ## Flush stdout before touching stderr to keep the documented
    ## "bbcode first, diagnostics afterwards" ordering.
    sys.stdout.write(outstring)
    sys.stdout.write('\n')
    sys.stdout.flush()
    for utag in unknowntags:
        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
    sys.stderr.flush()


 def tagrules_from_shell(args):
    '''
    Lazily decode extra tagrules passed as shell arguments.

    Every element of `args` is decoded with json.loads and the
    resulting value is yielded, in the order given. Nothing is
    validated here; a decoding error surfaces while the generator is
    being consumed.
    '''
    yield from map(json.loads, args)
            


 if __name__ == '__main__':
    ## Without INPUTFILE there is nothing to convert: print the usage
    ## line (sys.exit writes it to stderr and exits with status 1)
    ## instead of failing with an IndexError on sys.argv[1].
    if len(sys.argv) < 2:
        sys.exit('usage: python3 ' + sys.argv[0] + ' INPUTFILE [ EXTRARULE ... ]')
    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))
	#!/usr/bin/env python3

	#### USAGE
	##
	## python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
	##
	## Output is written to stdout.
	##
	## INPUTFILE must be an HTML file. XML should work too, but will likely
	## require redefining all replacement rules.
	##
	## EXTRARULE are additional replacement rules in the format of the
	## variable TAGRULES, which can also redefine predefined rules. Each
	## rule is a separate shell argument and must be passed as a JSON
	## string, e.g.
	##
	## python3 <scriptname> myfile.html \
	## '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
	## > myfile.bbcode
	##
	## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
	## implementations of BBCODE sadly use the more verbose
	## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; For these you need to
	## specify a corresponding EXTRARULE.
	##
	## When tags are encountered for which no rule is defined, they will
	## be omitted and reported on stderr after the bbcode output; The
	## ordering of "stdout before stderr" is enforced by flushing the
	## streams.
	##
	## Useful in combination with a markdown converter, e.g.
	##
	## pandoc --from=markdown --to=html myfile.md myfile.html
	## python3 <scriptname> myfile.html > myfile.bbcode


	#### TAGRULES.
	##
	## Format:
	##
	## TAGRULE = [['tag1','tag2', ...], RULE]
	## RULE, either:
	## 'ignore'
	## [BEFORE_TEXT,AFTER_TEXT]
	##
	## In the strings inside RULE, attributes of the html tag are
	## accessible as {NAME}, see e.g. the tag rule for 'a'.
	TAGRULES = [
	    ## Each entry is [TAGS, RULE]; RULE is either 'ignore' or
	    ## [BEFORE_TEXT, AFTER_TEXT]. Every tag is listed only once: main()
	    ## flattens this list into a dict, so a repeated entry would merely
	    ## overwrite itself.
	    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
	    [['p','div'], ['','\n']],
	    [['head'], 'ignore'],
	    [['html', 'body'], ['','']],
	    [['strong','b'], ['[B]','[/B]']],
	    [['em','i'], ['[I]','[/I]']],
	    [['pre'], ['[CODE]\n','\n[/CODE]']],
	    [['code'], ['[font=Courier New]','[/font]']],
	    ## {href} is filled in from the attributes of the <a> tag.
	    [['a'], ['[URL={href}]','[/URL]']],
	    [['u'], ['[U]','[/U]']],
	    [['ul'], ['[LIST]','[/LIST]']],
	    [['li'], ['[*]', '']],
	]

	#### CODE
	from lxml import html
	from lxml import etree
	from collections import namedtuple
	import sys
	import json


	def main(inpath, tagrules_override):
	    '''
	    Convert the HTML file `inpath` to BBCode and write it to stdout.

	    The document is parsed with lxml.html and walked depth-first. For
	    every element the BEFORE_TEXT of its rule is emitted, then the
	    element's text, then its children (each followed by its tail
	    text), then the AFTER_TEXT. Elements whose rule is 'ignore' are
	    dropped together with their content. Tags without any rule are
	    passed through without markup and listed on stderr after the
	    BBCode has been written and flushed.

	    Parameters:
	        inpath: handed to lxml.html.parse.
	        tagrules_override: iterable of TAGRULE entries in the format
	            of TAGRULES. They are applied after TAGRULES, so they
	            replace predefined rules for the same tag.

	    Raises:
	        Exception('Invalid rule', rule) if a rule is neither 'ignore'
	        nor a list/tuple of exactly two elements.
	    '''
	    tree = html.parse(inpath)
	    root = tree.getroot()

	    unknowntags = []
	    out = []

	    ## Normalize tagrules into {tag: rule}; later entries win.
	    tagrules = dict()
	    for taglist, rule in TAGRULES + list(tagrules_override):
	        for tag in taglist:
	            tagrules[tag] = rule

	    class _Attributes(dict):
	        ## Attribute lookup for the rule templates. An attribute the
	        ## element does not carry expands to '' instead of raising
	        ## KeyError (e.g. an <a> without href and '[URL={href}]').
	        def __missing__(self, name):
	            return ''

	    def _recur(node):
	        tag = node.tag
	        if not isinstance(tag, str):
	            ## Comments, processing instructions and entities have a
	            ## non-string tag in lxml; they carry no BBCode content.
	            return

	        if tag not in tagrules:
	            ## Remember the fallback so each unknown tag is reported
	            ## only once.
	            tagrules[tag] = ('', '')
	            unknowntags.append(tag)

	        rule = tagrules[tag]
	        if rule == 'ignore':
	            return
	        ## Checked after 'ignore' and by type, so that an arbitrary
	        ## two-character string is not mistaken for a (before, after)
	        ## pair.
	        if not (isinstance(rule, (list, tuple)) and len(rule) == 2):
	            raise Exception('Invalid rule', rule)

	        before, after = rule
	        attributes = _Attributes(node.attrib)
	        out.append(before.format_map(attributes))
	        if node.text:
	            out.append(node.text)
	        for child in node:  # iterating an element yields its children
	            _recur(child)
	            if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
	                out.append(child.tail)
	        out.append(after.format_map(attributes))

	    ## NOTE(review): getroot() presumably returns None when the parser
	    ## found no root element; in that case only a newline is printed.
	    if root is not None:
	        _recur(root)
	    outstring = ''.join(out).strip()

	    ## Flush stdout before touching stderr to keep the documented
	    ## "bbcode first, diagnostics afterwards" ordering.
	    sys.stdout.write(outstring)
	    sys.stdout.write('\n')
	    sys.stdout.flush()
	    for utag in unknowntags:
	        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
	    sys.stderr.flush()


	def tagrules_from_shell(args):
	    '''
	    Reads tagrules from a shell-argument list.

	    Each element of `args` must parse as a JSON list in the TAGRULE
	    format; the decoded values are yielded lazily, in order. Nothing
	    is validated here, and a decoding error surfaces while the
	    generator is being consumed.
	    '''
	    for arg in args:
	        yield json.loads(arg)



	if __name__ == '__main__':
	    ## Without INPUTFILE there is nothing to convert: print the usage
	    ## line (sys.exit writes it to stderr and exits with status 1)
	    ## instead of failing with an IndexError on sys.argv[1].
	    if len(sys.argv) < 2:
	        sys.exit('usage: python3 ' + sys.argv[0] + ' INPUTFILE [ EXTRARULE ... ]')
	    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))