Created
February 1, 2012 01:57
-
-
Save benui-dev/1714501 to your computer and use it in GitHub Desktop.
Parse wiki markup with mwlib
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from mwlib.refine import compat,core | |
| from mwlib.parser import nodes | |
| import pprint | |
def main():
    """Parse a sample wiki-markup document with mwlib and dump it to a file.

    The sample markup is parsed with ``compat.parse_txt``. Each item yielded
    by iterating over the result is passed to ``recursive_parse`` together
    with a file opened for writing at ``test.txt`` in the current working
    directory.
    """
    raw = """
=Cake=
Modern cake, especially layer cakes, normally contain a
combination of [[flour]], [[sugar]], [[egg (food)|eggs]], and [[butter]].
==Varieties==
This is a subheading.
===Chocolate===
Oh and another nested subheading
=Related Cakes=
* Jaffa cakes!?
* This related section is not a subsection
Cool
"""
    # core.parse_txt(raw) would return raw 'token' classes that aren't much
    # use. compat.parse_txt is the magic: it returns nicer classes
    # (Paragraph, PreFormatted, etc.).
    parsed = compat.parse_txt(raw)

    # Everything below here is just an example of walking the parsed tree.
    # The context manager guarantees the file is closed, even if the walk
    # raises; a bare `f.close` without parentheses would never call it.
    with open('test.txt', 'w') as f:
        for section in parsed:
            recursive_parse(f, section, 0)
def recursive_parse(f, node, indent):
    """Print the class name of ``node`` and write its text to ``f``, recursively.

    Args:
        f: Writable file-like object; it is handed UTF-8 encoded byte strings.
        node: Parse-tree node to process (compared against ``nodes.Section``,
            ``nodes.Table`` and ``nodes.Text``).
        indent: Nesting depth of ``node``. One space per level is printed in
            front of the class name; children are processed at ``indent + 1``.

    Returns:
        None.
    """
    tabs = indent * " "
    child_indent = indent + 1
    print(tabs + node.__class__.__name__)

    if isinstance(node, nodes.Section):
        # The first child supplies the heading text; prefix it with one "="
        # per section level and drop trailing whitespace.
        title = (u"=" * node.level + node.children[0].asText()).rstrip()
        # Encode explicitly, exactly as the Text branch does. Under Python 2,
        # writing a unicode object to a byte-oriented file goes through the
        # ASCII codec and fails on any non-ASCII heading.
        f.write(title.encode('UTF-8'))
        # Skip the heading child, move on to the rest.
        for child in node.children[1:]:
            recursive_parse(f, child, child_indent)
        return

    if isinstance(node, nodes.Table):
        # Tables are reported on stdout only; their children are not visited.
        print(tabs + "Table: Skipping")
        return

    if isinstance(node, nodes.Text):
        txt = node.asText()
        # A text node that is exactly one newline contributes nothing.
        if txt != "\n":
            f.write(txt.rstrip('\n').encode('UTF-8'))
    else:
        # Marker for node types that have no dedicated handling above.
        # NOTE(review): the nesting of this `else` was reconstructed from a
        # copy of the file that had lost its indentation; it may instead
        # belong to the `txt != "\n"` check -- confirm against the original.
        f.write("???")

    if node.children is not None:
        for child in node.children:
            recursive_parse(f, child, child_indent)
# Run the example only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment