Created
November 20, 2017 13:38
-
-
Save kbauer/946288e1f171c2b149abf6cee99d9334 to your computer and use it in GitHub Desktop.
A simple python3 script that takes html input and converts it to (an approximation in) BBCODE. Usage described in header comment. Also useful with markdown, preprocessed to html by pandoc. Allows customizing replacement rules for html-tags, which should also allow converting xml documents to some degree.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#### USAGE
##
##     python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
##
## Output is written to stdout.
##
## INPUTFILE must be a html file. Xml should work too, but will likely
## require redefining all replacement rules.
##
## EXTRARULE are additional replacement rules in the format of the
## variable TAGRULES, which can also redefine predefined rules. Each
## rule is a separate shell argument and must be passed as a JSON
## string, e.g.
##
##     python3 <scriptname> myfile.html \
##         '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
##         > myfile.bbcode
##
## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
## implementations of BBCODE sadly use the more verbose
## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; for these you need to
## specify a corresponding EXTRARULE.
##
## When tags are encountered for which no rule is defined, they will
## be omitted and reported on stderr *after* the bbcode output; the
## ordering of "stdout before stderr" is enforced by flushing the
## streams.
##
## Useful in combination with a markdown converter, e.g.
##
##     pandoc --from=markdown --to=html myfile.md --output=myfile.html
##     python3 <scriptname> myfile.html > myfile.bbcode
#### TAGRULES.
##
## Format:
##
##     TAGRULE = [['tag1','tag2', ...], RULE]
##     RULE, either:
##         'ignore'
##         [BEFORE_TEXT,AFTER_TEXT]
##
## In the strings inside RULE, attributes of the html tag are
## accessible as {NAME}, see e.g. the tag rule for 'a'.
##
## Each tag should be listed once; when a tag appears in several
## rules, main() keeps the rule that comes last.
TAGRULES = [
    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
    [['p', 'div'], ['', '\n']],
    [['head'], 'ignore'],
    [['html', 'body'], ['', '']],
    [['strong', 'b'], ['[B]', '[/B]']],
    [['em', 'i'], ['[I]', '[/I]']],
    [['pre'], ['[CODE]\n', '\n[/CODE]']],
    [['code'], ['[font=Courier New]', '[/font]']],
    [['a'], ['[URL={href}]', '[/URL]']],
    [['u'], ['[U]', '[/U]']],
    [['ul'], ['[LIST]', '[/LIST]']],
    [['li'], ['[*]', '']],
]
#### CODE | |
from lxml import html | |
from lxml import etree | |
from collections import namedtuple | |
import sys | |
import json | |
def main(inpath, tagrules_override):
    '''
    Convert the html file INPATH to BBCODE and write it to stdout.

    INPATH is handed to lxml's html.parse. TAGRULES_OVERRIDE is an
    iterable of additional rules in the TAGRULES format; they are
    applied after the predefined rules, so for a tag named more than
    once the last rule wins.

    Tags without a rule contribute only their content (no markup) and
    are each reported once on stderr, after the bbcode output has been
    written and flushed. Comments are dropped without a report.

    A placeholder such as {href} that names an attribute the tag does
    not carry expands to the empty string.

    Raises Exception('Invalid rule', rule) when a rule is neither
    'ignore' nor a [BEFORE_TEXT, AFTER_TEXT] pair.
    '''
    class _Attributes(dict):
        ## Attribute lookup for str.format_map: a missing attribute
        ## yields '' instead of a KeyError, so e.g. '<a name="x">'
        ## does not abort the whole conversion.
        def __missing__(self, name):
            return ''

    tree = html.parse(inpath)
    root = tree.getroot()
    unknowntags = []
    out = []

    ## Normalize tagrules into a flat  tag -> rule  mapping.
    tagrules = dict()
    for taglist, rule in TAGRULES + list(tagrules_override):
        for tag in taglist:
            tagrules[tag] = rule

    def _recur(node):
        if node.tag not in tagrules:
            if node.tag == etree.Comment:
                tagrules[node.tag] = 'ignore'
            else:
                ## Register a pass-through rule so that the tag is
                ## reported only once, however often it occurs.
                tagrules[node.tag] = ('', '')
                unknowntags.append(node.tag)
        rule = tagrules[node.tag]
        if rule == 'ignore':
            return
        ## Require a real pair; a bare two-character string must not
        ## be mistaken for [BEFORE_TEXT, AFTER_TEXT].
        if not (isinstance(rule, (list, tuple)) and len(rule) == 2):
            raise Exception('Invalid rule', rule)
        attributes = _Attributes(node.attrib)
        out.append(rule[0].format_map(attributes))
        if node.text:
            out.append(node.text)
        ## Iterate the element directly; getchildren() is deprecated.
        for child in node:
            _recur(child)
            if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
                out.append(child.tail)
        out.append(rule[1].format_map(attributes))

    ## Guard in case the parser hands back no root element (e.g. for
    ## an empty document); the output is then just the newline.
    if root is not None:
        _recur(root)

    outstring = ''.join(out).strip()
    sys.stdout.write(outstring)
    sys.stdout.write('\n')
    ## Flush stdout first so the report below follows the bbcode.
    sys.stdout.flush()
    for utag in unknowntags:
        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
    sys.stderr.flush()
def tagrules_from_shell(args):
    '''
    Lazily decode tagrules passed as shell arguments.

    Every element of ARGS is parsed as a JSON string, in order, and
    the decoded value is yielded. Each argument is expected to hold
    one rule in the TAGRULES entry format; no validation beyond the
    JSON parsing happens here.
    '''
    yield from map(json.loads, args)
if __name__ == '__main__':
    ## Without INPUTFILE there is nothing to convert: print the usage
    ## on stderr and exit non-zero instead of dying with an IndexError.
    if len(sys.argv) < 2:
        sys.exit('usage: python3 ' + sys.argv[0]
                 + ' INPUTFILE [ EXTRARULE ... ]')
    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment