Skip to content

Instantly share code, notes, and snippets.

@facelessuser
Last active September 17, 2024 16:45
Show Gist options
  • Save facelessuser/a6613237425b78e843c32268ef464e2e to your computer and use it in GitHub Desktop.
Save facelessuser/a6613237425b78e843c32268ef464e2e to your computer and use it in GitHub Desktop.
Prototype: Fancy Lists for Python Mardkwon
"""
Fancy lists in the style of Pandoc.
---
# A Python implementation of John Gruber's Markdown.
# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)
# License: BSD (see LICENSE.md for details).
---
Adapted to support "fancy" behavior by Copyright 2024 Isaac Muse.
Work in progress, not fully tested.
"""
from markdown import Extension
from markdown.blockprocessors import BlockProcessor
from markdown.treeprocessors import Treeprocessor
import xml.etree.ElementTree as etree
import re
VALID_ROMAN = re.compile(r'M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3})')
ROMAN_MAP = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
def roman2int(s):
"""Convert Roman numeral to integer."""
s = s.upper()
if not s or VALID_ROMAN.match(s) is None:
raise ValueError('invalid')
# Initialize result
total = 0
i = 0
while i < len(s):
# Current index is less than the next, subtract current from next and sum value
if i + 1 < len(s) and ROMAN_MAP[s[i]] < ROMAN_MAP[s[i + 1]]:
total += ROMAN_MAP[s[i + 1]] - ROMAN_MAP[s[i]]
i += 2
# Sum the value
else:
total += ROMAN_MAP[s[i]]
i += 1
return total
class FancyOListProcessor(BlockProcessor):
"""Process fancy ordered list blocks. """
TAG = 'ol'
SIBLING_TAGS = ['ol']
LAZY_OL = False
TYPES = {
'dot-hash': '1',
'paren-hash': '1',
'dot-num': '1',
'paren-num': '1',
'dot-roman': 'i',
'paren-roman': 'i',
'dot-ROMAN': 'I',
'paren-ROMAN': 'I',
'dot-alpha': 'a',
'paren-alpha': 'a',
'dot-ALPHA': 'A',
'paren-ALPHA': 'A'
}
def __init__(self, parser):
"""Initialize."""
super().__init__(parser)
# Detect an item (`1. item`). `group(1)` contains contents of item.
self.list_re = re.compile(
r'''
^[ ]{0,%d}
(?:
(?:
\d+ |
\# |
M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3}) |
m{0,3}(?:c[md]|d?c{0,3})(?:x[cl]|l?x{0,3})(?:i[xv]|v?i{0,3}) |
[a-z] |
[A-Z](?=[.)][ ]{2})
)
[).]
)
[ ]+(.*)
''' % (self.tab_length - 1),
re.VERBOSE
)
# Detect items on secondary lines. they can be of either list type.
self.child_re = re.compile(
r'''
^[ ]{0,%d}
((
(?:
(?:
\d+ |
\# |
M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3}) |
m{0,3}(?:c[md]|d?c{0,3})(?:x[cl]|l?x{0,3})(?:i[xv]|v?i{0,3}) |
[a-z] |
[A-Z](?=[.)][ ]{2})
)
[).] |
[-*+]
)
))[ ]+(.*)
''' % (self.tab_length - 1),
re.VERBOSE
)
# Detect indented (nested) items of either type
self.indent_re = re.compile(
r'''
^[ ]{%d,%d}
(
(
(?:
(?:
\d+ |
\# |
(?:M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3})){2,} |
(?:m{0,3}(?:c[md]|d?c{0,3})(?:x[cl]|l?x{0,3})(?:i[xv]|v?i{0,3})){2,} |
[a-z] |
[A-Z](?=[.)][ ]{2})
)
[).] |
[-*+]
)
)
)[ ]+.*
''' % (self.tab_length, self.tab_length * 2 - 1),
re.VERBOSE
)
self.startswith = "1"
def test(self, parent, block):
"""Test to see if block starts with a list."""
return bool(self.list_re.match(block))
def run(self, parent, blocks):
"""Process list items."""
sibling = self.lastChild(parent)
# Check for multiple items in one block and get the ordered list fancy type.
items, fancy_type = self.get_items(sibling, blocks.pop(0), blocks)
# Append list items that are under the sibling list if the list type matches
if (
sibling is not None and sibling.tag in self.SIBLING_TAGS and
sibling.attrib.get('__fancylist', '') == fancy_type
):
# Previous block was a list item, so set that as parent
lst = sibling
# make sure previous item is in a `p` - if the item has text,
# then it isn't in a `p`
if lst[-1].text:
# since it's possible there are other children for this
# sibling, we can't just `SubElement` the `p`, we need to
# insert it as the first item.
p = etree.Element('p')
p.text = lst[-1].text
lst[-1].text = ''
lst[-1].insert(0, p)
# if the last item has a tail, then the tail needs to be put in a `p`
# likely only when a header is not followed by a blank line
lch = self.lastChild(lst[-1])
if lch is not None and lch.tail:
p = etree.SubElement(lst[-1], 'p')
p.text = lch.tail.lstrip()
lch.tail = ''
# parse first block differently as it gets wrapped in a `p`.
li = etree.SubElement(lst, 'li')
self.parser.state.set('looselist')
firstitem = items.pop(0)
self.parser.parseBlocks(li, [firstitem])
self.parser.state.reset()
# this catches the edge case of a multi-item indented list whose
# first item is in a blank parent-list item:
# * * subitem1
# * subitem2
# see also `ListIndentProcessor`
elif parent.tag in ['ol', 'ul']:
lst = parent
# This is a new, unique list so create parent with appropriate tag.
else:
if self.TAG == 'ol':
lst = etree.SubElement(parent, self.TAG, {'type': self.TYPES[fancy_type], '__fancylist': fancy_type})
else:
lst = etree.SubElement(parent, self.TAG)
# Check if a custom start integer is set
if not self.LAZY_OL and self.startswith != '1':
lst.attrib['start'] = self.startswith
# Set the parse set to list
self.parser.state.set('list')
# Loop through items in block, recursively parsing each with the appropriate parent.
for item in items:
# Item is indented. Parse with last item as parent
if item.startswith(' '*self.tab_length):
self.parser.parseBlocks(lst[-1], [item])
# New item. Create `li` and parse with it as parent
else:
li = etree.SubElement(lst, 'li')
self.parser.parseBlocks(li, [item])
# Rest the parse state
self.parser.state.reset()
def get_start(self, fancy_type, m):
"""Translate list convention into a logical start."""
t = fancy_type.split('-')[1].lower()
if t == 'hash':
return '1'
elif t == 'num':
return m.group(1)[:-1].lstrip('(')
elif t == 'roman':
return str(roman2int(m.group(1)[:-1]))
elif t == 'alpha':
return str(ord(m.group(1)[:-1].upper()) - 64)
def get_fancy_type(self, m, first, fancy_type):
"""Get the fancy type for a given list item."""
value = m.group(1)[:-1]
sep = m.group(1)[-1]
list_type = ''
# Determine list type convention: _., _), (_)
if sep == '.':
list_type += 'dot-'
elif sep == ')':
if value.startswith('('):
list_type += 'fullparen-'
value = value[1:]
else:
list_type += 'paren-'
else:
return list_type
# Determine numbering: numerical, roman numerical, alphabetic, or `#` numerical placeholder.
if value == '#':
list_type += 'hash'
elif value.isdigit():
list_type += 'num'
elif len(value) == 1 and value.isalpha():
if value.islower():
if first and (value not in 'ivxlcdm' or ((list_type + 'roman') != fancy_type and value != 'i')):
list_type += 'alpha'
elif not first and (list_type + 'alpha') == fancy_type:
list_type += 'alpha'
else:
list_type += 'roman'
elif value.isupper():
if first and (value not in 'IVXLCDM' or ((list_type + 'ROMAN') != fancy_type and value != 'I')):
list_type += 'ALPHA'
elif not first and (list_type + 'ALPHA') == fancy_type:
list_type += 'ALPHA'
else:
list_type += 'ROMAN'
elif value.isupper() and VALID_ROMAN.match(value[:-1]):
list_type += 'ROMAN'
elif value.islower() and VALID_ROMAN.match(value[:-1].upper()):
list_type += 'roman'
else:
list_type = ''
return list_type
def get_items(self, sibling, block, blocks):
"""Break a block into list items."""
# Get ordered list fancy type
fancy_type = ''
if self.TAG == 'ol':
if sibling is not None and sibling.tag in self.SIBLING_TAGS:
fancy_type = sibling.attrib.get('__fancylist', '')
fancy = fancy_type
items = []
rest = []
for line in block.split('\n'):
# We've found a list type that differs form the our current,
# so gather the rest to be processed separately.
if rest:
rest.append(line)
continue
# Child list items
m = self.child_re.match(line)
if m:
# This is a new list item check first item for the start index.
# Also check for list items that differ from the first.
fancy = self.get_fancy_type(m, not items, fancy)
# We found a different fancy type, so handle these separately
if items and fancy != fancy_type:
rest.append(line)
continue
# Detect the integer value of first list item
if not items and self.TAG == 'ol':
self.startswith = self.get_start(fancy, m)
fancy_type = fancy
# Append to the list
items.append(m.group(3))
# Indented, possibly nested content
elif self.indent_re.match(line):
# Previous item was indented. Append to that item.
if items[-1].startswith(' ' * self.tab_length):
items[-1] = '{}\n{}'.format(items[-1], line)
# Other indented content
else:
items.append(line)
# Append non list items to previous list item.
else:
items[-1] = '{}\n{}'.format(items[-1], line)
# Insert non-list items back into the blocks to be parsed later
if rest:
blocks.insert(0, '\n'.join(rest))
return items, fancy_type
class FancyUListProcessor(FancyOListProcessor):
"""Process unordered list blocks."""
SIBLING_TAGS = ['ul']
TAG = 'ul'
def __init__(self, parser):
"""Initialize."""
super().__init__(parser)
self.list_re = re.compile(r'^[ ]{0,%d}[-+*][ ]+(.*)' % (self.tab_length - 1))
class FancyListTreeprocessor(Treeprocessor):
"""Clean up fancy list metadata."""
def run(self, root):
"""Remove intermediate fancy list type metadata."""
for ol in root.iter('ol'):
if '__fancylist' in ol.attrib:
del ol.attrib['__fancylist']
return root
class FancyListExtension(Extension):
"""HTML Blocks Extension."""
def extendMarkdown(self, md):
"""Add Details to Markdown instance."""
md.registerExtension(self)
md.parser.blockprocessors.register(FancyOListProcessor(md.parser), 'olist', 40)
md.parser.blockprocessors.register(FancyUListProcessor(md.parser), 'ulist', 30)
md.treeprocessors.register(FancyListTreeprocessor(md), "olist-cleanup", 10)
def makeExtension(*args, **kwargs):
"""Return extension."""
return FancyListExtension(*args, **kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment