Created
November 27, 2017 20:58
-
-
Save nitely/9fe34f64cd300aaa92af731f5634ce8b to your computer and use it in GitHub Desktop.
Python Markdown parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright (c) 2017 by Esteban Castro Borsani. | |
# Released under MIT license | |
from .elements import ( | |
Header, | |
Quote, | |
HRule, | |
UListItem, | |
UList, | |
OListItem, | |
OList, | |
Code, | |
LinkRefLabel, | |
Paragraph, | |
Literal, | |
Link, | |
LinkRef, | |
DoubleEmphasis, | |
Emphasis, | |
CodeSpan, | |
Image, | |
ImageRef, | |
AutoLink) | |
from .elements import ( | |
default_elements, | |
default_children) | |
from .parser import ( | |
MarkDown, | |
parse, | |
render) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright (c) 2017 by Esteban Castro Borsani. | |
# Released under MIT license | |
import re | |
# Sentinel keys naming the rule groups of the children map (see
# DEFAULT_CHILDREN below).  Each element declares which group of child
# rules may be parsed inside each of its content fields.
BLOCK_ELEMENTS = 'BLOCK_ELEMENTS'
INLINE_ELEMENTS = 'INLINE_ELEMENTS'
ALL_ELEMENTS = 'ALL_ELEMENTS'
U_LIST_ELEMENTS = 'U_LIST_ELEMENTS'
O_LIST_ELEMENTS = 'O_LIST_ELEMENTS'
class Element:
    """Abstract base class for every markdown element.

    Subclasses override the three class attributes and the two static
    methods; instances are never created.
    """

    # Token string used to tag this element's nodes in the AST.
    name = ''
    # Regex pattern strings that match this element's source form.
    patterns = ()
    # Maps a content-dict key to the rule group used to parse its value.
    children = {}

    @staticmethod
    def parse(match, ctx):
        """Turn a regex match into a content dict.  Must be overridden."""
        raise NotImplementedError

    @staticmethod
    def render(content, ctx):
        """Turn a content dict into an HTML string.  Must be overridden."""
        raise NotImplementedError
class Header(Element):
    """ATX (``# Title``) and Setext (underlined) headers."""

    name = 'header'
    patterns = (
        r'^(?P<h1>[^\n]+)\n=+\n+',
        r'^(?P<h2>[^\n]+)\n\-+\n+',
        r'^(#{1,6})(?P<hx>[^\n]+)\n+')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        named = match.groupdict()
        positional = match.groups()
        if named['hx']:
            # ATX style: level is the number of leading '#' characters;
            # trailing '#' decoration is stripped from the title.
            level = len(positional[0])
            title = positional[1].lstrip(' ').rstrip(' #')
        else:
            # Setext style: '=' underline means h1, '-' underline h2.
            level = 1 if named['h1'] else 2
            title = positional[0].strip()
        return {'text': title, 'level': level}

    @staticmethod
    def render(content, ctx):
        return '<h%(level)s>%(text)s</h%(level)s>\n' % content
class Quote(Element):
    """Blockquote: consecutive lines starting with ``>``."""

    name = 'quote'
    patterns = (r'^(?:>(?:[^\n]+\n)*\n+)+',)
    children = {'text': ALL_ELEMENTS}

    # Strips the leading '>' (plus one optional space) from every line.
    _quote_sub_pattern = re.compile(r'^> ?', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        quoted = match.group(0)
        return {'text': Quote._quote_sub_pattern.sub('', quoted)}

    @staticmethod
    def render(content, ctx):
        return '<blockquote>\n%(text)s</blockquote>\n' % content
class HRule(Element):
    """Horizontal rule: three or more '-' or '*' on their own line."""

    name = 'h_rule'
    patterns = (
        r'^(?:\- ?){3,}\n+',
        r'^(?:\* ?){3,}\n+')
    children = {}

    @staticmethod
    def parse(match, ctx):
        # A rule carries no content of its own.
        return {}

    @staticmethod
    def render(content, ctx):
        return '<hr />\n'
class UListItem(Element):
    """A single unordered-list item; only produced inside a UList."""

    name = 'u_list_item'
    # patterns = (r'^[*\-+] [^\n]+\n*(?:[^*\-+\n][^\n]+\n?(?:\n {4})?)*\n*',)  # Old version
    # patterns = (r'^[*\-+] [^\n]+\n*([^*\-+][^\n]+\n*)*',)
    patterns = (r'^[*\-+] ([^*\-+\n]\n*[^\n]+\n*)*',)
    children = {'text': ALL_ELEMENTS}
    # Removes the leading bullet ("* ", "- ", "+ ") or the 4-space
    # continuation indent from every line of the item.
    _list_sub_pattern = re.compile(r'^(?:[*\-+] {1,3}| {1,4})', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        return {
            'text': UListItem
            ._list_sub_pattern
            .sub('', match.group(0))}

    @staticmethod
    def render(content, ctx):
        return '<li>%s</li>\n' % content['text'].rstrip('\n')

    @staticmethod
    def _remove_paragraphs(children_list):
        # For "tight" lists: unwrap paragraph children back into their
        # raw parts so items render without <p> tags.
        new_children = []
        for child in children_list:
            child_token, child_children = child
            if child_token != Paragraph.name:
                new_children.append(child)
                continue
            new_children.extend(child_children['text'])
        return new_children

    @staticmethod
    def _add_paragraphs(children_list):
        # For "loose" lists: wrap bare raw-text children in paragraph
        # nodes so every item renders with <p> tags.
        new_children = []
        for child in children_list:
            child_token, child_children = child
            if child_token != '_raw_text':
                new_children.append(child)
                continue
            new_children.append((
                Paragraph.name,
                {'text': [child]}))
        return new_children

    @staticmethod
    def post_parse(node, parent_ctx, ctx):
        # parent_ctx comes from UList.post_parse and says whether the
        # list as a whole is "loose"; items are normalized accordingly.
        token, children = node
        if not parent_ctx['has_loose_item']:
            new_children_list = UListItem._remove_paragraphs(children['text'])
        else:
            new_children_list = UListItem._add_paragraphs(children['text'])
        # Return the rewritten node plus an (empty) ctx for its children.
        return (
            (token, {**children, **{'text': new_children_list}}),
            {})
class UList(Element):
    """Unordered list: one or more bullet ('*', '-', '+') items."""

    name = 'u_list'
    patterns = (r'^(?:[*\-+] (?:[^\n]+\n(?:\n {4})?)+\n+)+',)
    children = {'text': U_LIST_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # todo: check if has loose_item right here (empty line anywhere but the end) and remove post_parse
        return {'text': match.group(0)}

    @staticmethod
    def render(content, ctx):
        return '<ul>\n%(text)s</ul>\n' % content

    @staticmethod
    def _has_loose_item(u_list):
        # A list is "loose" when the last item parsed into more than one
        # child, or any earlier item contains a Paragraph child.
        _token, children = u_list
        *items_list, (_token_last, last_item_children) = children['text']
        # todo: check if any children is a block element (skip inlines)
        #
        # Does not matter what elements these are
        # Skip if there's just one
        # todo: wtf?
        try:
            if len(last_item_children['text']) > 1:
                return True
        except TypeError:
            # last item's 'text' is not a sized container here — treat
            # the list as tight.
            return False
        return any(
            Paragraph.name == token_child
            for _token_item_list, item_list in items_list
            for token_child, _child in item_list['text'])

    @staticmethod
    def post_parse(node, parent_ctx, ctx):
        # Propagate looseness down to each UListItem via its parent_ctx.
        return node, {'has_loose_item': UList._has_loose_item(node)}
class OListItem(Element):
    """A single ordered-list item; only produced inside an OList."""

    name = 'o_list_item'
    # todo: benchmark
    patterns = (r'^\d{1,9}\. [^\n]+\n(?:[^\d][^\n]+\n(?:\n {4})?)*\n*',)
    children = {'text': ALL_ELEMENTS}

    # Strips the "1. " marker or the 4-space continuation indent.
    _list_sub_pattern = re.compile(
        r'^(?:\d{1,9}\. {1,3}| {1,4})', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        raw = match.group(0)
        unmarked = OListItem._list_sub_pattern.sub('', raw)
        return {'text': unmarked.rstrip()}

    @staticmethod
    def render(content, ctx):
        return '<li>%s</li>\n' % content['text'].rstrip('\n')
class OList(Element):
    """Ordered list: one or more numbered ("1. ") items."""

    name = 'o_list'
    patterns = (r'^(?:\d{1,9}\. (?:[^\n]+\n(?:\n {4})?)+\n+)+',)
    children = {'text': O_LIST_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # The whole matched run is re-parsed into o_list_item children.
        return {'text': match.group(0)}

    @staticmethod
    def render(content, ctx):
        return '<ol>\n%(text)s</ol>\n' % content
class Code(Element):
    """Code block: 4-space indented lines or a ``` fenced block."""

    name = 'code'
    patterns = (
        r'^(?: {4}[^\n]+\n+)+',
        r'^```\n+[^`]+```\n+')
    children = {}

    # Strips the 4-space indent or the ``` fence markers per line.
    _code_sub_pattern = re.compile(r'^( {4}|```\n*)', flags=re.M)

    @staticmethod
    def parse(match, ctx):
        source = match.group(0)
        return {'text': Code._code_sub_pattern.sub('', source)}

    @staticmethod
    def render(content, ctx):
        return '<pre><code>%(text)s</code></pre>\n' % content
class LinkRefLabel(Element):
    """Link-reference definition, e.g. ``[label]: url "title"``.

    Parsing stores ``(url, title)`` in ``ctx`` under the lowercased
    label; rendering emits nothing (definitions are invisible).
    """

    name = 'link_ref_label'
    patterns = (
        r'^ {,3}\[([^\]]+)\]: +(?:<([^>]+)>|([^ \n]+) ?\n?) *(?:'
        r'"([^"]+)"|\'([^\']+)\'|\(([^\)]+)\))\n+',
        r'^ {,3}\[([^\]]+)\]: +(?:<([^>]+)>|([^\n]+)\n)\n*')
    children = {}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        # First pattern carries 6 groups (3 title alternatives);
        # the second, title-less pattern carries only 3.
        if len(groups) > 3:
            title = groups[3] or groups[4] or groups[5] or ''
        else:
            title = ''
        # The url came from either the <...> form or the bare form.
        url = groups[1] or groups[2] or ''
        ctx[groups[0].lower()] = (
            url.strip(),
            title.strip())
        return {}

    @staticmethod
    def render(content, ctx):
        return ''
class Paragraph(Element):
    """Paragraph: a run of non-empty lines ended by a blank line."""

    name = 'paragraph'
    patterns = (r'^(?:[^\n]+\n)*\n+',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # todo: use INLINE_ELEMENTS + BR?
        # todo: use INLINE - Paragraph, so we dont strip here
        body = match.group(0)
        return {'text': body.strip()}

    @staticmethod
    def render(content, ctx):
        # todo: replace line ending with 2 spaces or tabs by <br>
        return '<p>%(text)s</p>\n' % content
class Literal(Element):
    """Backslash escape for a markdown punctuation character."""

    name = 'literal'
    patterns = (r'\\[`*_\{\}\[\]\(\)#+\-\.!\\]',)
    children = {}

    @staticmethod
    def parse(match, ctx):
        # Keep only the character following the backslash.
        escaped = match.group(0)
        return {'text': escaped[1]}

    @staticmethod
    def render(content, ctx):
        return content['text']
class Link(Element):
    """Inline link: ``[text](url)`` or ``[text](url "title")``."""

    name = 'link'
    patterns = (
        r'\[([^\]]+)\]\(([^ ]+) "([^"]+)"\)',
        r'\[([^\]]+)\]\(([^\)]+)\)')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        groups = match.groups()
        content = {
            'text': groups[0],
            'link': groups[1]}
        try:
            # Only the first pattern captures a title group.
            content['title'] = groups[2]
        except IndexError:
            pass
        return content

    @staticmethod
    def render(content, ctx):
        if 'title' not in content:
            return '<a href="%(link)s">%(text)s</a>' % content
        return (
            '<a href="%(link)s" '
            'title="%(title)s">%(text)s</a>' % content)
class LinkRef(Element):
    """Reference link ``[text][label]``, resolved against ``ctx``."""

    name = 'link_ref'
    patterns = (r'\[([^\]]+)\] ?\[([^\]]*)\]',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        text, label = match.groups()
        # An empty label (implicit ref) falls back to the text itself.
        return {
            'text': text,
            'ref': (label or text).lower()}

    @staticmethod
    def render(content, ctx):
        try:
            link, title = ctx[content['ref']]
        except KeyError:
            # Unknown reference: emit the plain text.
            return '%(text)s' % content
        return Link.render(
            {'link': link,
             'title': title,
             'text': content['text']},
            ctx)
class DoubleEmphasis(Element):
    """Strong emphasis: ``**text**`` or ``__text__``."""

    name = 'double_emphasis'
    patterns = (
        r'\*\*[^ ](?:[^ ]+ \*\* )*[^\*]*\*\*',
        r'__[^ ](?:[^ ]+ __ )*[^_]*__')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # Drop the two-character delimiters at both ends.
        marked = match.group(0)
        return {'text': marked[2:-2]}

    @staticmethod
    def render(content, ctx):
        return '<strong>%(text)s</strong>' % content
class Emphasis(Element):
    """Emphasis: ``*text*`` or ``_text_``."""

    name = 'emphasis'
    patterns = (
        # ('emphasis', r'\*[^ ](?:[^ \\]+(?: \* |\\\*))*[^\*]*\*'),
        r'\*[^ ](?:[^ ]+ \* )*[^\*]*\*',
        r'_[^ ](?:[^ ]+ _ )*[^_]*_')
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # Drop the single-character delimiters at both ends.
        marked = match.group(0)
        return {'text': marked[1:-1]}

    @staticmethod
    def render(content, ctx):
        return '<em>%(text)s</em>' % content
class CodeSpan(Element):
    """Inline code span: single- or double-backtick delimited."""

    name = 'code_span'
    patterns = (
        r'``([^`]+`)+`',
        r'`[^`]+`')
    children = {}

    # Strips the opening/closing backticks plus one optional space.
    _code_sub_pattern = re.compile(r'(?:^``? ?| ?`?`$)')

    @staticmethod
    def parse(match, ctx):
        span = match.group(0)
        return {'text': CodeSpan._code_sub_pattern.sub('', span)}

    @staticmethod
    def render(content, ctx):
        return '<code>%(text)s</code>' % content
class Image(Element): | |
name = 'image' | |
patterns = ( | |
r'!\[([^\]]+)\]\(([^ ]+) "([^"]+)"\)', | |
r'!\[([^\]]+)\]\(([^\)]+)\)') | |
children = {'text': INLINE_ELEMENTS} | |
@staticmethod | |
def parse(match, ctx): | |
groups = match.groups() | |
content = { | |
'text': groups[0], | |
'link': groups[1]} | |
try: | |
content['title'] = groups[2] | |
except IndexError: | |
pass | |
return content | |
@staticmethod | |
def render(content, ctx): | |
if 'title' in content: | |
return ( | |
'<img src="%(link)s" ' | |
'title="%(title)s">%(text)s</img>' % content) | |
else: | |
return ( | |
'<img src="%(link)s">%(text)s</img>' % content) | |
class ImageRef(Element):
    """Reference image ``![alt][label]``, resolved against ``ctx``."""

    name = 'image_ref'
    patterns = (r'!\[([^\]]+)\] ?\[([^\]]+)\]',)
    children = {'text': INLINE_ELEMENTS}

    @staticmethod
    def parse(match, ctx):
        # Lowercase the label: LinkRefLabel stores ctx keys lowercased
        # and LinkRef looks them up lowercased, so without this a
        # case-mismatched image reference would never resolve.
        return {
            'text': match.group(1),
            'ref': match.group(2).lower()}

    @staticmethod
    def render(content, ctx):
        try:
            link, title = ctx[content['ref']]
        except KeyError:
            # Unknown reference: emit the plain alt text.
            return '%(text)s' % content
        return Image.render(
            {'link': link,
             'title': title,
             'text': content['text']},
            ctx)
class AutoLink(Element):
    """Auto-link: a bare ``<url>`` rendered as a link to itself."""

    name = 'auto_link'
    patterns = (r'<[^>]+>',)
    children = {}

    @staticmethod
    def parse(match, ctx):
        # Drop the surrounding angle brackets.
        return {'link': match.group(0)[1:-1]}

    @staticmethod
    def render(content, ctx):
        target = content['link']
        return Link.render(
            {'link': target,
             'text': target},
            ctx)
# All element classes shipped with the parser, in declaration order.
_ALL_ELEMENT_TYPES = (
    Header,
    Quote,
    HRule,
    UListItem,
    UList,
    OListItem,
    OList,
    Code,
    LinkRefLabel,
    Paragraph,
    Literal,
    Link,
    LinkRef,
    DoubleEmphasis,
    Emphasis,
    CodeSpan,
    Image,
    ImageRef,
    AutoLink)

# Registry mapping element token name -> element class.
DEFAULT_ELEMENTS = {
    element.name: element
    for element in _ALL_ELEMENT_TYPES}


def default_elements():
    """Return a fresh, mutable copy of the default element registry."""
    return DEFAULT_ELEMENTS.copy()
def to_rules(elements):
    """Flatten element classes into ``(name, pattern)`` scanner rules."""
    rules = []
    for element in elements:
        for pattern in element.patterns:
            rules.append((element.name, pattern))
    return tuple(rules)
# Default children map: which (name, pattern) rules may be parsed
# inside each rule group.  Order matters — earlier rules win overlaps.
DEFAULT_CHILDREN = {
    BLOCK_ELEMENTS: to_rules((
        Header,
        Quote,
        HRule,
        UList,
        OList,
        Code,  # todo: move to top?
        LinkRefLabel,
        Paragraph)),
    INLINE_ELEMENTS: to_rules((
        Literal,
        Link,
        LinkRef,
        DoubleEmphasis,
        Emphasis,
        CodeSpan,
        Image,
        ImageRef,
        AutoLink)),
    U_LIST_ELEMENTS: to_rules((UListItem,)),
    O_LIST_ELEMENTS: to_rules((OListItem,))}

# The combined group tries block rules first, then inline rules.
DEFAULT_CHILDREN[ALL_ELEMENTS] = (
    *DEFAULT_CHILDREN[BLOCK_ELEMENTS],
    *DEFAULT_CHILDREN[INLINE_ELEMENTS])


def default_children():
    """Return a fresh, mutable copy of the default children map."""
    return DEFAULT_CHILDREN.copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright (c) 2017 by Esteban Castro Borsani. | |
# Released under MIT license | |
import re | |
from . import scanner | |
from . import elements as elms | |
def _text_pre_process(text): | |
# todo: replace lines with only spaces and tabs by \n | |
return '%s\n\n' % ( | |
text.replace('\r\n', '\n') | |
.replace('\r', '\n') | |
.strip('\n')) | |
# Compiled Scanner instances, keyed by their (hashable) rules tuple.
_scanners_cache = {}


def _scanner_for(rules):
    """Return a Scanner compiled from ``rules``, caching per rules tuple."""
    if rules not in _scanners_cache:
        _scanners_cache[rules] = scanner.Scanner(rules, flags=re.M)
    return _scanners_cache[rules]
# Token tag used for unmatched text fragments in the AST.
_RAW_TEXT = '_raw_text'


def _parse(txt, ctx, acc, rules, children_map, elements, level, limit):
    """Recursively scan ``txt`` into a list of AST nodes.

    Each node is ``(token, content_dict)`` or ``(_RAW_TEXT, str)``.
    ``acc`` is the output list (appended in place and returned);
    ``level``/``limit`` bound the recursion depth.
    """
    if (not rules or
        level == limit):
        # No rules to match against (or max depth): keep the text raw.
        acc.append((_RAW_TEXT, txt))
        return acc
    for token, match_or_hole in _scanner_for(rules).scan_with_holes(txt):
        if token is None:
            # A "hole": text between matches stays raw.
            acc.append((_RAW_TEXT, match_or_hole))
            continue
        element = elements[token]
        content = element.parse(match_or_hole, ctx)
        # Recurse into each declared child field with its rule group.
        children = {
            ck: _parse(
                txt=content[ck],
                ctx=ctx,
                acc=[],
                rules=children_map[rule_name],
                children_map=children_map,
                elements=elements,
                level=level + 1,
                limit=limit)
            for ck, rule_name in element.children.items()}
        acc.append((token, {**content, **children}))
    return acc
# Default maximum nesting depth for recursive parsing.
_RECURSION_LIMIT = 10


def parse(
        txt,
        ctx,
        children_map=elms.DEFAULT_CHILDREN,
        elements=elms.DEFAULT_ELEMENTS,
        limit=_RECURSION_LIMIT):
    """Parse markdown ``txt`` into an AST (a list of nodes).

    ``ctx`` collects link-reference definitions as a side effect.
    """
    # Depth limit is clamped to at least one level.
    effective_limit = max(limit, 1)
    return _parse(
        txt=_text_pre_process(txt),
        ctx=ctx,
        acc=[],
        rules=children_map[elms.ALL_ELEMENTS],
        children_map=children_map,
        elements=elements,
        level=0,
        limit=effective_limit)
def _post_parse(ast, parent_ctx, ctx):
    """Run each element's optional ``post_parse`` hook, mutating ``ast``."""
    # NOTE(review): looks up elms.DEFAULT_ELEMENTS, not any custom
    # elements map — custom elements won't get their hooks run; confirm.
    for index, node in enumerate(ast):
        token, _payload = node
        if token == _RAW_TEXT:
            continue
        element = elms.DEFAULT_ELEMENTS[token]
        hook = getattr(element, 'post_parse', None)
        if hook is None:
            continue
        new_node, curr_ctx = hook(node, parent_ctx, ctx)
        ast[index] = new_node
        # Recurse into the (possibly replaced) node's children.
        for child_key in element.children:
            _post_parse(new_node[1][child_key], curr_ctx, ctx)


def post_parse(ast, ctx):
    """Post-process a parsed AST in place and return it."""
    _post_parse(ast, {}, ctx)
    return ast
def _escape(text): | |
return ( | |
text.replace('&', '&') | |
.replace('&', '&') | |
.replace('<', '<') | |
.replace('>', '>')) | |
def _render(ast, ctx):
    """Render a list of AST nodes to a single HTML string."""
    res = []
    for token, children_or_text in ast:
        if token == _RAW_TEXT:
            # Raw text fragments are escaped and emitted directly.
            res.append(_escape(children_or_text))
            continue
        element = elms.DEFAULT_ELEMENTS[token]
        # Render child fields first (depth-first).
        content = {
            ck: _render(children_or_text[ck], ctx)
            for ck, cv in element.children.items()}
        # Escape any extra plain-string data the element parsed
        # (e.g. a link URL) that is not a child field.
        cleaned_extra_data = {
            name: _escape(value)
            for name, value in children_or_text.items()
            if (name not in element.children and
                isinstance(value, str))}
        # Later dicts win: rendered children and escaped strings
        # override the raw parsed values.
        res.append(element.render(
            {**children_or_text,
             **content,
             **cleaned_extra_data},
            ctx))
    return ''.join(res)
def render(ast_or_txt, ctx=None):
    """Render markdown text (or an already-parsed AST) to HTML."""
    ctx = {} if ctx is None else ctx
    if isinstance(ast_or_txt, str):
        # Text input: run the full parse + post-parse pipeline first.
        ast = post_parse(parse(ast_or_txt, ctx), ctx)
    else:
        ast = ast_or_txt
    return _render(ast, ctx)
class MarkDown:
    """Markdown renderer with a customizable element registry."""

    def __init__(self):
        self._elements = elms.default_elements()
        self._children = elms.default_children()

    def new_element_type(self, element_type):
        """Register a new (initially empty) rule group."""
        self._children.setdefault(element_type, ())

    def add_element(self, element_type, element, index):
        """Register ``element`` and insert its scanner rules at ``index``
        of the ``element_type`` rule group.
        """
        self._elements[element.name] = element
        children = list(self._children[element_type])
        # Insert the element's (name, pattern) rules — the rule groups
        # hold rule tuples, not element classes (Scanner unpacks
        # (name, regex) pairs; inserting the class would break it).
        children[index:index] = elms.to_rules((element,))
        self._children[element_type] = tuple(children)
        # Rebuild the combined group block-first, matching the priority
        # order used by elms.DEFAULT_CHILDREN (block rules must be
        # tried before inline rules).
        self._children[elms.ALL_ELEMENTS] = (
            *self._children[elms.BLOCK_ELEMENTS],
            *self._children[elms.INLINE_ELEMENTS])

    def add_inline(self, element, index):
        """Insert an inline element's rules at ``index``."""
        self.add_element(elms.INLINE_ELEMENTS, element, index)

    def add_block(self, element, index):
        """Insert a block element's rules at ``index``."""
        self.add_element(elms.BLOCK_ELEMENTS, element, index)

    def render(self, text, context=None, nesting_limit=_RECURSION_LIMIT):
        """Parse, post-parse and render ``text`` to HTML.

        ``context`` collects link-reference definitions; a fresh dict
        is used when none is given.
        """
        if context is None:
            context = {}
        return render(
            post_parse(
                parse(
                    txt=text,
                    ctx=context,
                    children_map=self._children,
                    elements=self._elements,
                    limit=nesting_limit),
                context),
            context)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright (c) 2017 by Esteban Castro Borsani. | |
# | |
# Original code by Armin Ronacher. | |
# Modifications under MIT licence. | |
# Copyright (c) 2015 by Armin Ronacher. | |
# | |
# Some rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions are | |
# met: | |
# | |
# * Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# | |
# * Redistributions in binary form must reproduce the above | |
# copyright notice, this list of conditions and the following | |
# disclaimer in the documentation and/or other materials provided | |
# with the distribution. | |
# | |
# * The names of the contributors may not be used to endorse or | |
# promote products derived from this software without specific | |
# prior written permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
from sre_parse import Pattern, SubPattern, parse | |
from sre_compile import compile as sre_compile | |
from sre_constants import BRANCH, SUBPATTERN | |
__all__ = ['Scanner'] | |
class _ScanMatch:
    """Proxy over a match from the compounded scanner regex.

    Re-maps group numbers and prefixed group names so the match looks
    as if the rule's own pattern had been compiled and matched alone.
    """

    def __init__(self, match, rule, start, end):
        # match: the underlying re match object.
        # rule: name of the rule that matched.
        # start/end: this rule's group-index slice within the
        # compounded regex.
        self._match = match
        self._start = start
        self._end = end
        self._rule = rule

    def __repr__(self):
        return '%s<%s>' % (
            __class__.__name__,
            repr(self._match.groups()))

    def __getattr__(self, name):
        # Anything not overridden falls through to the real match.
        return getattr(self._match, name)

    def __group_proc(self, method, group):
        # Translate a caller-visible group (0, a rule-relative int, or
        # an unprefixed name) into the compounded regex's numbering.
        if group == 0:
            return method()
        if isinstance(group, str):
            return method('%s_%s' % (self._rule, group))
        real_group = self._start + group
        if real_group > self._end:
            raise IndexError('no such group')
        return method(real_group)

    def group(self, *groups):
        if len(groups) in (0, 1):
            return self.__group_proc(
                self._match.group,
                groups and groups[0] or 0)
        return tuple(
            self.__group_proc(self._match.group, group)
            for group in groups)

    def groupdict(self, default=None):
        # Strip the '<rule>_' prefix Scanner added to group names and
        # keep only this rule's groups.
        prefix = '%s_' % self._rule
        len_prefix = len(prefix)
        return {
            key[len_prefix:]: value
            for key, value in self._match.groupdict(default).items()
            if key.startswith(prefix)}

    def span(self, group=0):
        return self.__group_proc(self._match.span, group)

    def groups(self):
        # Only this rule's slice of the compounded groups.
        return self._match.groups()[self._start:self._end]

    def start(self, group=0):
        return self.__group_proc(self._match.start, group)

    def end(self, group=0):
        return self.__group_proc(self._match.end, group)

    def expand(self, template):
        raise RuntimeError('Unsupported on scan matches')
class Scanner:
    """
    Similar to ``re.Scanner``: compiles many named patterns into one
    compounded regex.

    Unlike ``re.Scanner`` it uses ``search`` to find matches, so the
    unmatched parts of the string can be recovered as "holes".

    Group names are prefixed with ``<name_of_rule>_`` to avoid group
    name clashes; groupdicts can still be retrieved as normal without
    the prefix.  Group indexes are adjusted so they work per-rule
    instead of as per the compounded regex.

    Caveat: group-index back-references resolve against the
    compounded regex, so for all practical purposes they won't work.

    NOTE(review): built on ``sre_parse``/``sre_compile`` internals,
    which are undocumented and change between CPython versions —
    confirm against the interpreter version this was written for.
    """

    def __init__(self, rules, flags=0):
        pattern = Pattern()
        pattern.flags = flags
        # Reserve one outer group per rule; match.lastindex then tells
        # which rule fired.
        for _ in range(len(rules)):
            pattern.opengroup()
        # Wrap opengroup so each named group gets a '<rule>_' prefix
        # ('name' is the loop variable below, bound late on purpose:
        # the lambda runs while that rule's regex is being parsed).
        _og = pattern.opengroup
        pattern.opengroup = lambda n: _og(n and '%s_%s' % (name, n) or n)
        self.rules = []
        subpatterns = []
        subflags = set()
        for group, (name, regex) in enumerate(rules, 1):
            last_group = pattern.groups - 1
            subpattern = parse(regex, flags, pattern)
            subpatterns.append(SubPattern(pattern, [
                (SUBPATTERN, (group, subpattern)),
            ]))
            subflags.add(subpattern.pattern.flags)
            # (name, group slice start, group slice end) for remapping
            # group indexes back to per-rule numbering in _ScanMatch.
            self.rules.append((name, last_group, pattern.groups - 1))
        # One big alternation: rule1 | rule2 | ... .
        self._scanner = sre_compile(SubPattern(
            pattern, [(BRANCH, (None, subpatterns))])).scanner
        if len(subflags) > 1:
            raise ValueError(
                'In-pattern flags are not supported')

    def _scan(self, string):
        # Yield (rule_name, wrapped_match) for every match, in order.
        sc = self._scanner(string)
        for match in iter(sc.search, None):
            rule, start, end = self.rules[match.lastindex - 1]
            yield rule, _ScanMatch(match, rule, start, end)

    def scan_with_holes(self, string):
        # Like _scan, but also yields (None, text) for unmatched gaps
        # before, between, and after matches.
        pos = 0
        for rule, match in self._scan(string):
            hole = string[pos:match.start()]
            if hole:
                yield None, hole
            yield rule, match
            pos = match.end()
        hole = string[pos:]
        if hole:
            yield None, hole
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment