@nitely
Created November 27, 2017 20:58
Python Markdown parser
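The code below is a small Markdown-to-HTML parser split into a package init, an elements module, a parser module and a regex Scanner module. A minimal usage sketch follows (not part of the gist): it assumes the modules are saved as a package, here called md (an example name only), and a Python 3.6-era interpreter, since the Scanner relies on sre_parse/sre_compile internals that changed in later versions.

# Usage sketch; the package name "md" is only an example.
from md import MarkDown, render

html = render('# Title\n\nSome *emphasis* and a [link](http://example.com).\n')
# Roughly: '<h1>Title</h1>\n<p>Some <em>emphasis</em> and a
#          <a href="http://example.com">link</a>.</p>\n'

md_parser = MarkDown()
html = md_parser.render('> quoted\n\n- item one\n- item two\n')
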
# -*- coding: utf-8 -*-
# Copyright (c) 2017 by Esteban Castro Borsani.
# Released under MIT license
from .elements import (
    Header,
    Quote,
    HRule,
    UListItem,
    UList,
    OListItem,
    OList,
    Code,
    LinkRefLabel,
    Paragraph,
    Literal,
    Link,
    LinkRef,
    DoubleEmphasis,
    Emphasis,
    CodeSpan,
    Image,
    ImageRef,
    AutoLink)
from .elements import (
    default_elements,
    default_children)
from .parser import (
    MarkDown,
    parse,
    render)

# -*- coding: utf-8 -*-
# Copyright (c) 2017 by Esteban Castro Borsani.
# Released under MIT license
import re
BLOCK_ELEMENTS = 'BLOCK_ELEMENTS'
INLINE_ELEMENTS = 'INLINE_ELEMENTS'
ALL_ELEMENTS = 'ALL_ELEMENTS'
U_LIST_ELEMENTS = 'U_LIST_ELEMENTS'
O_LIST_ELEMENTS = 'O_LIST_ELEMENTS'
class Element:
    name = ''
    patterns = ()
    children = {}

    @staticmethod
    def parse(match, ctx):
        raise NotImplementedError

    @staticmethod
    def render(content, ctx):
        raise NotImplementedError

class Header(Element):
    name = 'header'
    patterns = (
        r'^(?P<h1>[^\n]+)\n=+\n+',
        r'^(?P<h2>[^\n]+)\n\-+\n+',
        r'^(#{1,6})(?P<hx>[^\n]+)\n+')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        named_groups = match.groupdict()
        groups = match.groups()
        if named_groups['hx']:
            title = (
                groups[1]
                .lstrip(' ')
                .rstrip(' #'))
            level = len(groups[0])
        elif named_groups['h1']:
            title = groups[0].strip()
            level = 1
        else:  # h2
            title = groups[0].strip()
            level = 2
        return {
            'text': title,
            'level': level}

    @staticmethod
    def render(content, ctx):
        return '<h%(level)s>%(text)s</h%(level)s>\n' % content

class Quote(Element):
    name = 'quote'
    patterns = (r'^(?:>(?:[^\n]+\n)*\n+)+',)
    children = {'text': ALL_ELEMENTS}
    _quote_sub_pattern = re.compile(r'^> ?', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        return {
            'text': Quote
            ._quote_sub_pattern
            .sub('', match.group(0))}

    @staticmethod
    def render(content, ctx):
        return '<blockquote>\n%(text)s</blockquote>\n' % content

class HRule(Element):
    name = 'h_rule'
    patterns = (
        r'^(?:\- ?){3,}\n+',
        r'^(?:\* ?){3,}\n+')
    children = {}

    @staticmethod
    def parse(match, ctx):
        return {}

    @staticmethod
    def render(content, ctx):
        return '<hr />\n'

class UListItem(Element):
    name = 'u_list_item'
    # patterns = (r'^[*\-+] [^\n]+\n*(?:[^*\-+\n][^\n]+\n?(?:\n {4})?)*\n*',)  # Old version
    # patterns = (r'^[*\-+] [^\n]+\n*([^*\-+][^\n]+\n*)*',)
    patterns = (r'^[*\-+] ([^*\-+\n]\n*[^\n]+\n*)*',)
    children = {'text': ALL_ELEMENTS}
    _list_sub_pattern = re.compile(r'^(?:[*\-+] {1,3}| {1,4})', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        return {
            'text': UListItem
            ._list_sub_pattern
            .sub('', match.group(0))}

    @staticmethod
    def render(content, ctx):
        return '<li>%s</li>\n' % content['text'].rstrip('\n')

    @staticmethod
    def _remove_paragraphs(children_list):
        new_children = []
        for child in children_list:
            child_token, child_children = child
            if child_token != Paragraph.name:
                new_children.append(child)
                continue
            new_children.extend(child_children['text'])
        return new_children

    @staticmethod
    def _add_paragraphs(children_list):
        new_children = []
        for child in children_list:
            child_token, child_children = child
            if child_token != '_raw_text':
                new_children.append(child)
                continue
            new_children.append((
                Paragraph.name,
                {'text': [child]}))
        return new_children

    @staticmethod
    def post_parse(node, parent_ctx, ctx):
        token, children = node
        if not parent_ctx['has_loose_item']:
            new_children_list = UListItem._remove_paragraphs(children['text'])
        else:
            new_children_list = UListItem._add_paragraphs(children['text'])
        return (
            (token, {**children, **{'text': new_children_list}}),
            {})

class UList(Element):
    name = 'u_list'
    patterns = (r'^(?:[*\-+] (?:[^\n]+\n(?:\n {4})?)+\n+)+',)
    children = {'text': U_LIST_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # todo: check if has loose_item right here (empty line anywhere but the end) and remove post_parse
        return {'text': match.group(0)}

    @staticmethod
    def render(content, ctx):
        return '<ul>\n%(text)s</ul>\n' % content

    @staticmethod
    def _has_loose_item(u_list):
        _token, children = u_list
        *items_list, (_token_last, last_item_children) = children['text']
        # todo: check if any children is a block element (skip inlines)
        #
        # Does not matter what elements these are
        # Skip if there's just one
        # todo: wtf?
        try:
            if len(last_item_children['text']) > 1:
                return True
        except TypeError:
            return False
        return any(
            Paragraph.name == token_child
            for _token_item_list, item_list in items_list
            for token_child, _child in item_list['text'])

    @staticmethod
    def post_parse(node, parent_ctx, ctx):
        return node, {'has_loose_item': UList._has_loose_item(node)}

class OListItem(Element):
    name = 'o_list_item'
    # todo: benchmark
    patterns = (r'^\d{1,9}\. [^\n]+\n(?:[^\d][^\n]+\n(?:\n {4})?)*\n*',)
    children = {'text': ALL_ELEMENTS}
    _list_sub_pattern = re.compile(
        r'^(?:\d{1,9}\. {1,3}| {1,4})', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        return {
            'text': OListItem
            ._list_sub_pattern
            .sub('', match.group(0))
            .rstrip()}

    @staticmethod
    def render(content, ctx):
        return '<li>%s</li>\n' % content['text'].rstrip('\n')

class OList(Element):
    name = 'o_list'
    patterns = (r'^(?:\d{1,9}\. (?:[^\n]+\n(?:\n {4})?)+\n+)+',)
    children = {'text': O_LIST_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        return {'text': match.group(0)}

    @staticmethod
    def render(content, ctx):
        return '<ol>\n%(text)s</ol>\n' % content

class Code(Element):
    name = 'code'
    patterns = (
        r'^(?: {4}[^\n]+\n+)+',
        r'^```\n+[^`]+```\n+')
    children = {}
    _code_sub_pattern = re.compile(r'^( {4}|```\n*)', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        return {
            'text': Code
            ._code_sub_pattern
            .sub('', match.group(0))}

    @staticmethod
    def render(content, ctx):
        return '<pre><code>%(text)s</code></pre>\n' % content

class LinkRefLabel(Element):
    name = 'link_ref_label'
    patterns = (
        r'^ {,3}\[([^\]]+)\]: +(?:<([^>]+)>|([^ \n]+) ?\n?) *(?:'
        r'"([^"]+)"|\'([^\']+)\'|\(([^\)]+)\))\n+',
        r'^ {,3}\[([^\]]+)\]: +(?:<([^>]+)>|([^\n]+)\n)\n*')
    children = {}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        if len(groups) > 3:
            title = groups[3] or groups[4] or groups[5] or ''
        else:
            title = ''
        url = groups[1] or groups[2] or ''
        ctx[groups[0].lower()] = (
            url.strip(),
            title.strip())
        return {}

    @staticmethod
    def render(content, ctx):
        return ''

class Paragraph(Element):
    name = 'paragraph'
    patterns = (r'^(?:[^\n]+\n)*\n+',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # todo: use INLINE_ELEMENTS + BR?
        # todo: use INLINE - Paragraph, so we dont strip here
        return {'text': match.group(0).strip()}

    @staticmethod
    def render(content, ctx):
        # todo: replace line ending with 2 spaces or tabs by <br>
        return '<p>%(text)s</p>\n' % content

class Literal(Element):
    name = 'literal'
    patterns = (r'\\[`*_\{\}\[\]\(\)#+\-\.!\\]',)
    children = {}

    @staticmethod
    def parse(match, ctx):
        return {'text': match.group(0)[1]}

    @staticmethod
    def render(content, ctx):
        return content['text']

class Link(Element):
    name = 'link'
    patterns = (
        r'\[([^\]]+)\]\(([^ ]+) "([^"]+)"\)',
        r'\[([^\]]+)\]\(([^\)]+)\)')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        content = {
            'text': groups[0],
            'link': groups[1]}
        try:
            content['title'] = groups[2]
        except IndexError:
            pass
        return content

    @staticmethod
    def render(content, ctx):
        if 'title' in content:
            return (
                '<a href="%(link)s" '
                'title="%(title)s">%(text)s</a>' % content)
        else:
            return (
                '<a href="%(link)s">%(text)s</a>' % content)

class LinkRef(Element):
    name = 'link_ref'
    patterns = (r'\[([^\]]+)\] ?\[([^\]]*)\]',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        return {
            'text': groups[0],
            'ref': (groups[1] or groups[0]).lower()}

    @staticmethod
    def render(content, ctx):
        try:
            link, title = ctx[content['ref']]
        except KeyError:
            return '%(text)s' % content
        else:
            return Link.render(
                {'link': link,
                 'title': title,
                 'text': content['text']},
                ctx)

class DoubleEmphasis(Element):
    name = 'double_emphasis'
    patterns = (
        r'\*\*[^ ](?:[^ ]+ \*\* )*[^\*]*\*\*',
        r'__[^ ](?:[^ ]+ __ )*[^_]*__')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        return {'text': match.group(0)[2:-2]}

    @staticmethod
    def render(content, ctx):
        return '<strong>%(text)s</strong>' % content

class Emphasis(Element):
    name = 'emphasis'
    patterns = (
        # ('emphasis', r'\*[^ ](?:[^ \\]+(?: \* |\\\*))*[^\*]*\*'),
        r'\*[^ ](?:[^ ]+ \* )*[^\*]*\*',
        r'_[^ ](?:[^ ]+ _ )*[^_]*_')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        return {'text': match.group(0)[1:-1]}

    @staticmethod
    def render(content, ctx):
        return '<em>%(text)s</em>' % content

class CodeSpan(Element):
    name = 'code_span'
    patterns = (
        r'``([^`]+`)+`',
        r'`[^`]+`')
    children = {}
    _code_sub_pattern = re.compile(r'(?:^``? ?| ?`?`$)')

    @staticmethod
    def parse(match, ctx):
        return {
            'text': CodeSpan
            ._code_sub_pattern
            .sub('', match.group(0))}

    @staticmethod
    def render(content, ctx):
        return '<code>%(text)s</code>' % content

class Image(Element):
    name = 'image'
    patterns = (
        r'!\[([^\]]+)\]\(([^ ]+) "([^"]+)"\)',
        r'!\[([^\]]+)\]\(([^\)]+)\)')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        content = {
            'text': groups[0],
            'link': groups[1]}
        try:
            content['title'] = groups[2]
        except IndexError:
            pass
        return content

    @staticmethod
    def render(content, ctx):
        # <img> is a void element, so the text goes into the alt attribute
        if 'title' in content:
            return (
                '<img src="%(link)s" '
                'title="%(title)s" alt="%(text)s" />' % content)
        else:
            return (
                '<img src="%(link)s" alt="%(text)s" />' % content)

class ImageRef(Element):
    name = 'image_ref'
    patterns = (r'!\[([^\]]+)\] ?\[([^\]]+)\]',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        return {
            'text': match.group(1),
            'ref': match.group(2).lower()}

    @staticmethod
    def render(content, ctx):
        try:
            link, title = ctx[content['ref']]
        except KeyError:
            return '%(text)s' % content
        else:
            return Image.render(
                {'link': link,
                 'title': title,
                 'text': content['text']},
                ctx)

class AutoLink(Element):
    name = 'auto_link'
    patterns = (r'<[^>]+>',)
    children = {}

    @staticmethod
    def parse(match, ctx):
        return {'link': match.group(0)[1:-1]}

    @staticmethod
    def render(content, ctx):
        link = content['link']
        return Link.render(
            {'link': link,
             'text': link},
            ctx)

DEFAULT_ELEMENTS = {
    element.name: element
    for element in (
        Header,
        Quote,
        HRule,
        UListItem,
        UList,
        OListItem,
        OList,
        Code,
        LinkRefLabel,
        Paragraph,
        Literal,
        Link,
        LinkRef,
        DoubleEmphasis,
        Emphasis,
        CodeSpan,
        Image,
        ImageRef,
        AutoLink)}


def default_elements():
    return DEFAULT_ELEMENTS.copy()


def to_rules(elements):
    return tuple(
        (element.name, pattern)
        for element in elements
        for pattern in element.patterns)


DEFAULT_CHILDREN = {
    BLOCK_ELEMENTS: to_rules((
        Header,
        Quote,
        HRule,
        UList,
        OList,
        Code,  # todo: move to top?
        LinkRefLabel,
        Paragraph)),
    INLINE_ELEMENTS: to_rules((
        Literal,
        Link,
        LinkRef,
        DoubleEmphasis,
        Emphasis,
        CodeSpan,
        Image,
        ImageRef,
        AutoLink)),
    U_LIST_ELEMENTS: to_rules((
        UListItem,)),
    O_LIST_ELEMENTS: to_rules((
        OListItem,))}
DEFAULT_CHILDREN[ALL_ELEMENTS] = (
    *DEFAULT_CHILDREN[BLOCK_ELEMENTS],
    *DEFAULT_CHILDREN[INLINE_ELEMENTS])


def default_children():
    return DEFAULT_CHILDREN.copy()

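That is the whole element API the parser consumes: an element declares a name, regex patterns, a children map from content keys to child rule sets, plus parse and render. As an illustration only (not part of the gist), a hypothetical strikethrough element could look like the sketch below; note that the default render path looks elements up in DEFAULT_ELEMENTS, so a custom element also has to be registered there, and its rules added to the relevant children map (e.g. via to_rules), before it takes effect.

# Hypothetical example element, not part of the original module.
class Strikethrough(Element):
    name = 'strikethrough'
    patterns = (r'~~([^~]+)~~',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        return {'text': match.group(1)}

    @staticmethod
    def render(content, ctx):
        return '<del>%(text)s</del>' % content
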
# -*- coding: utf-8 -*-
# Copyright (c) 2017 by Esteban Castro Borsani.
# Released under MIT license
import re
from . import scanner
from . import elements as elms
def _text_pre_process(text):
    # todo: replace lines with only spaces and tabs by \n
    return '%s\n\n' % (
        text.replace('\r\n', '\n')
        .replace('\r', '\n')
        .strip('\n'))

_scanners_cache = {}


def _scanner_for(rules):
    try:
        return _scanners_cache[rules]
    except KeyError:
        new_scanner = scanner.Scanner(rules, flags=re.M)
        _scanners_cache[rules] = new_scanner
        return new_scanner

_RAW_TEXT = '_raw_text'


def _parse(txt, ctx, acc, rules, children_map, elements, level, limit):
    if (not rules or
            level == limit):
        acc.append((_RAW_TEXT, txt))
        return acc
    for token, match_or_hole in _scanner_for(rules).scan_with_holes(txt):
        if token is None:
            acc.append((_RAW_TEXT, match_or_hole))
            continue
        element = elements[token]
        content = element.parse(match_or_hole, ctx)
        children = {
            ck: _parse(
                txt=content[ck],
                ctx=ctx,
                acc=[],
                rules=children_map[rule_name],
                children_map=children_map,
                elements=elements,
                level=level + 1,
                limit=limit)
            for ck, rule_name in element.children.items()}
        acc.append((token, {**content, **children}))
    return acc

_RECURSION_LIMIT = 10


def parse(
        txt,
        ctx,
        children_map=elms.DEFAULT_CHILDREN,
        elements=elms.DEFAULT_ELEMENTS,
        limit=_RECURSION_LIMIT):
    if limit < 1:
        limit = 1
    return _parse(
        txt=_text_pre_process(txt),
        ctx=ctx,
        acc=[],
        rules=children_map[elms.ALL_ELEMENTS],
        children_map=children_map,
        elements=elements,
        level=0,
        limit=limit)

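# Example of the AST shape that `parse` returns (hand-derived sketch,
# for illustration only; not generated output):
#
#     parse('# Title\n\nSome *text*\n', ctx={})
#
# gives roughly:
#
#     [('header', {'text': [('_raw_text', 'Title')], 'level': 1}),
#      ('paragraph', {'text': [('_raw_text', 'Some '),
#                              ('emphasis', {'text': [('_raw_text', 'text')]})]})]
#
# Each node is a (token, content) tuple; the keys named in
# Element.children hold nested node lists, and unmatched stretches
# of text become ('_raw_text', str) leaves.
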
def _post_parse(ast, parent_ctx, ctx):
    for index, (token, children_or_text) in enumerate(ast):
        if token == _RAW_TEXT:
            continue
        element = elms.DEFAULT_ELEMENTS[token]
        if not hasattr(element, 'post_parse'):
            continue
        new_node, curr_ctx = element.post_parse(
            (token, children_or_text),
            parent_ctx,
            ctx)
        ast[index] = new_node
        for ck, cv in element.children.items():
            _post_parse(new_node[1][ck], curr_ctx, ctx)


def post_parse(ast, ctx):
    _post_parse(ast, {}, ctx)
    return ast

def _escape(text):
    return (
        text.replace('&amp;', '&')
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))

def _render(ast, ctx):
    res = []
    for token, children_or_text in ast:
        if token == _RAW_TEXT:
            res.append(_escape(children_or_text))
            continue
        element = elms.DEFAULT_ELEMENTS[token]
        content = {
            ck: _render(children_or_text[ck], ctx)
            for ck, cv in element.children.items()}
        cleaned_extra_data = {
            name: _escape(value)
            for name, value in children_or_text.items()
            if (name not in element.children and
                isinstance(value, str))}
        res.append(element.render(
            {**children_or_text,
             **content,
             **cleaned_extra_data},
            ctx))
    return ''.join(res)

def render(ast_or_txt, ctx=None):
    if ctx is None:
        ctx = {}
    if isinstance(ast_or_txt, str):
        ast_or_txt = post_parse(
            parse(ast_or_txt, ctx),
            ctx)
    return _render(ast_or_txt, ctx)

class MarkDown:

    def __init__(self):
        self._elements = elms.default_elements()
        self._children = elms.default_children()

    def new_element_type(self, element_type):
        self._children.setdefault(element_type, ())

    def add_element(self, element_type, element, index):
        self._elements[element.name] = element
        children = list(self._children[element_type])
        # the children map holds (name, pattern) rules, not element classes
        children[index:index] = elms.to_rules((element,))
        self._children[element_type] = tuple(children)
        self._children[elms.ALL_ELEMENTS] = (
            *self._children[elms.BLOCK_ELEMENTS],
            *self._children[elms.INLINE_ELEMENTS])

    def add_inline(self, element, index):
        self.add_element(elms.INLINE_ELEMENTS, element, index)

    def add_block(self, element, index):
        self.add_element(elms.BLOCK_ELEMENTS, element, index)

    def render(self, text, context=None, nesting_limit=_RECURSION_LIMIT):
        if context is None:
            context = {}
        return render(
            post_parse(
                parse(
                    txt=text,
                    ctx=context,
                    children_map=self._children,
                    elements=self._elements,
                    limit=nesting_limit),
                context),
            context)

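An illustration (not part of the gist) of the staged pipeline that MarkDown.render composes out of the module-level functions above; it also shows the role of ctx: LinkRefLabel.parse stores link labels in it during parsing, and LinkRef.render reads them back.

# Pipeline sketch, illustration only.
ctx = {}
ast = parse('[home][1]\n\n[1]: http://example.com "Home"\n', ctx)
ast = post_parse(ast, ctx)   # applies post_parse hooks (loose/tight list fix-up)
html = render(ast, ctx)
# ctx now holds {'1': ('http://example.com', 'Home')} and the link_ref
# node renders as: <a href="http://example.com" title="Home">home</a>
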
# -*- coding: utf-8 -*-
# Copyright (c) 2017 by Esteban Castro Borsani.
#
# Original code by Armin Ronacher.
# Modifications under MIT licence.
# Copyright (c) 2015 by Armin Ronacher.
#
# Some rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# * The names of the contributors may not be used to endorse or
# promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from sre_parse import Pattern, SubPattern, parse
from sre_compile import compile as sre_compile
from sre_constants import BRANCH, SUBPATTERN
__all__ = ['Scanner']
class _ScanMatch:

    def __init__(self, match, rule, start, end):
        self._match = match
        self._start = start
        self._end = end
        self._rule = rule

    def __repr__(self):
        return '%s<%s>' % (
            __class__.__name__,
            repr(self._match.groups()))

    def __getattr__(self, name):
        return getattr(self._match, name)

    def __group_proc(self, method, group):
        if group == 0:
            return method()
        if isinstance(group, str):
            return method('%s_%s' % (self._rule, group))
        real_group = self._start + group
        if real_group > self._end:
            raise IndexError('no such group')
        return method(real_group)

    def group(self, *groups):
        if len(groups) in (0, 1):
            return self.__group_proc(
                self._match.group,
                groups and groups[0] or 0)
        return tuple(
            self.__group_proc(self._match.group, group)
            for group in groups)

    def groupdict(self, default=None):
        prefix = '%s_' % self._rule
        len_prefix = len(prefix)
        return {
            key[len_prefix:]: value
            for key, value in self._match.groupdict(default).items()
            if key.startswith(prefix)}

    def span(self, group=0):
        return self.__group_proc(self._match.span, group)

    def groups(self):
        return self._match.groups()[self._start:self._end]

    def start(self, group=0):
        return self.__group_proc(self._match.start, group)

    def end(self, group=0):
        return self.__group_proc(self._match.end, group)

    def expand(self, template):
        raise RuntimeError('Unsupported on scan matches')

class Scanner:
    """
    This is similar to ``re.Scanner``: it builds one compound
    regex pattern out of many patterns.

    Unlike ``re.Scanner`` it uses ``search`` to find matches,
    so the unmatched parts of the string can be collected as well.

    Group names are prefixed with ``name_of_rule_`` to avoid
    group-name clashes. Group dicts can still be retrieved as
    usual, without the prefix.

    Group indexes are adjusted so they work as expected,
    instead of being relative to the compound regex.

    Caveat: group-index back-references remain relative to the
    compound regex, so for all practical purposes they won't work.
    """

    def __init__(self, rules, flags=0):
        pattern = Pattern()
        pattern.flags = flags
        # reserve one outer group per rule; groups opened while parsing
        # each rule's regex get prefixed with that rule's name
        for _ in range(len(rules)):
            pattern.opengroup()
        _og = pattern.opengroup
        pattern.opengroup = lambda n: _og(n and '%s_%s' % (name, n) or n)
        self.rules = []
        subpatterns = []
        subflags = set()
        for group, (name, regex) in enumerate(rules, 1):
            # record the range of group indexes this rule's own groups occupy
            last_group = pattern.groups - 1
            subpattern = parse(regex, flags, pattern)
            subpatterns.append(SubPattern(pattern, [
                (SUBPATTERN, (group, subpattern)),
            ]))
            subflags.add(subpattern.pattern.flags)
            self.rules.append((name, last_group, pattern.groups - 1))
        self._scanner = sre_compile(SubPattern(
            pattern, [(BRANCH, (None, subpatterns))])).scanner
        if len(subflags) > 1:
            raise ValueError(
                'In-pattern flags are not supported')

    def _scan(self, string):
        sc = self._scanner(string)
        for match in iter(sc.search, None):
            rule, start, end = self.rules[match.lastindex - 1]
            yield rule, _ScanMatch(match, rule, start, end)

    def scan_with_holes(self, string):
        pos = 0
        for rule, match in self._scan(string):
            hole = string[pos:match.start()]
            if hole:
                yield None, hole
            yield rule, match
            pos = match.end()
        hole = string[pos:]
        if hole:
            yield None, hole
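
A small hand-checked sketch of how the Scanner is used (the two rules are throwaway examples, not ones from the elements module). The scanner is written against the sre_parse/sre_compile internals of Python 3.6-era interpreters; later versions changed those internals, so it may need porting.

# Illustration only.
sc = Scanner((
    ('word', r'(?P<w>[a-z]+)'),
    ('num', r'\d+')))
for token, match_or_hole in sc.scan_with_holes('ab 12!'):
    if token is None:
        print('hole', repr(match_or_hole))    # unmatched text: ' ' and '!'
    else:
        print(token, match_or_hole.group(0))  # 'word ab', then 'num 12'
        # named groups come back unprefixed, e.g. {'w': 'ab'} for the
        # word rule via match_or_hole.groupdict()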