Created
November 21, 2009 03:31
-
-
Save dlo/239976 to your computer and use it in GitHub Desktop.
Textile 2.1.3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
PyTextile | |
A Humane Web Text Generator | |
""" | |
__version__ = '2.1.3' | |
__date__ = '2009/02/07' | |
__copyright__ = """ | |
Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ | |
Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ | |
Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ | |
Original PHP Version: | |
Copyright (c) 2003-2004, Dean Allen <[email protected]> | |
All rights reserved. | |
Thanks to Carlo Zottmann <[email protected]> for refactoring | |
Textile's procedural code into a class framework | |
Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/ | |
""" | |
__license__ = """ | |
L I C E N S E | |
============= | |
Redistribution and use in source and binary forms, with or without | |
modification, are permitted provided that the following conditions are met: | |
* Redistributions of source code must retain the above copyright notice, | |
this list of conditions and the following disclaimer. | |
* Redistributions in binary form must reproduce the above copyright notice, | |
this list of conditions and the following disclaimer in the documentation | |
and/or other materials provided with the distribution. | |
* Neither the name Textile nor the names of its contributors may be used to | |
endorse or promote products derived from this software without specific | |
prior written permission. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
POSSIBILITY OF SUCH DAMAGE. | |
""" | |
import re | |
import uuid | |
from urlparse import urlparse | |
import sgmllib | |
def _normalize_newlines(string): | |
import re | |
out = re.sub(r'\r\n', '\n', string) | |
out = re.sub(r'\n{3,}', '\n\n', out) | |
out = re.sub(r'\n\s*\n', '\n\n', out) | |
out = re.sub(r'"$', '" ', out) | |
return out | |
# PyTextile can optionally sanitize the generated XHTML, | |
# which is good for weblog comments. This code is from | |
# Mark Pilgrim's feedparser. | |
class _BaseHTMLProcessor(sgmllib.SGMLParser): | |
elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', | |
'img', 'input', 'isindex', 'link', 'meta', 'param'] | |
def __init__(self): | |
sgmllib.SGMLParser.__init__(self) | |
def reset(self): | |
self.pieces = [] | |
sgmllib.SGMLParser.reset(self) | |
def normalize_attrs(self, attrs): | |
# utility method to be called by descendants | |
attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs] | |
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] | |
return attrs | |
def unknown_starttag(self, tag, attrs): | |
# called for each start tag | |
# attrs is a list of (attr, value) tuples | |
# e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")] | |
strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs]) | |
if tag in self.elements_no_end_tag: | |
self.pieces.append("<%(tag)s%(strattrs)s />" % locals()) | |
else: | |
self.pieces.append("<%(tag)s%(strattrs)s>" % locals()) | |
def unknown_endtag(self, tag): | |
# called for each end tag, e.g. for </pre>, tag will be "pre" | |
# Reconstruct the original end tag. | |
if tag not in self.elements_no_end_tag: | |
self.pieces.append("</%(tag)s>" % locals()) | |
def handle_charref(self, ref): | |
# called for each character reference, e.g. for " ", ref will be "160" | |
# Reconstruct the original character reference. | |
self.pieces.append("&#%(ref)s;" % locals()) | |
def handle_entityref(self, ref): | |
# called for each entity reference, e.g. for "©", ref will be "copy" | |
# Reconstruct the original entity reference. | |
self.pieces.append("&%(ref)s;" % locals()) | |
def handle_data(self, text): | |
# called for each block of plain text, i.e. outside of any tag and | |
# not containing any character or entity references | |
# Store the original text verbatim. | |
self.pieces.append(text) | |
def handle_comment(self, text): | |
# called for each HTML comment, e.g. <!-- insert Javascript code here --> | |
# Reconstruct the original comment. | |
self.pieces.append("<!--%(text)s-->" % locals()) | |
def handle_pi(self, text): | |
# called for each processing instruction, e.g. <?instruction> | |
# Reconstruct original processing instruction. | |
self.pieces.append("<?%(text)s>" % locals()) | |
def handle_decl(self, text): | |
# called for the DOCTYPE, if present, e.g. | |
# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" | |
# "http://www.w3.org/TR/html4/loose.dtd"> | |
# Reconstruct original DOCTYPE | |
self.pieces.append("<!%(text)s>" % locals()) | |
def output(self): | |
"""Return processed HTML as a single string""" | |
return "".join(self.pieces) | |
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist filter built on ``_BaseHTMLProcessor``.

    Tags not in ``acceptable_elements`` are dropped (their text content is
    kept), attributes not in ``acceptable_attributes`` are dropped, and the
    text inside elements listed in ``unacceptable_elements_with_end_tag`` is
    suppressed.  Processing instructions and declarations are discarded.
    """

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # Elements whose text content is suppressed as well as the tags.
    unacceptable_elements_with_end_tag = ['script', 'applet']

    # This is for MathML.
    mathml_elements = ['math', 'mi', 'mn', 'mo', 'mrow', 'msup']
    mathml_attributes = ['mode', 'xmlns']

    acceptable_elements = acceptable_elements + mathml_elements
    acceptable_attributes = acceptable_attributes + mathml_attributes

    def reset(self):
        """Reset the parser and the suppressed-element nesting counter."""
        _BaseHTMLProcessor.reset(self)
        # Depth of currently open script/applet elements.
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        """Emit *tag* only if whitelisted, keeping only whitelisted attributes."""
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        """Emit the end tag only if *tag* is whitelisted."""
        if tag not in self.acceptable_elements:
            # Only unwind when something is actually open: a stray
            # "</script>" must not push the counter negative, because any
            # non-zero value makes handle_data() discard all later text.
            if tag in self.unacceptable_elements_with_end_tag and self.unacceptablestack > 0:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        """Discard processing instructions."""
        pass

    def handle_decl(self, text):
        """Discard declarations such as DOCTYPE."""
        pass

    def handle_data(self, text):
        """Keep text unless it is inside a suppressed element."""
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
# PyTextile can optionally validate the generated | |
# XHTML code using either mxTidy or uTidyLib. | |
try: | |
# This is mxTidy. | |
from mx.Tidy import Tidy | |
def _tidy1(text): | |
"""mxTidy's XHTML validator. | |
This function is a wrapper to mxTidy's validator. | |
""" | |
nerrors, nwarnings, text, errortext = Tidy.tidy(text, output_xhtml=1, numeric_entities=1, wrap=0) | |
return _in_tag(text, 'body') | |
_tidy = _tidy1 | |
except ImportError: | |
try: | |
# This is uTidyLib. | |
import tidy | |
def _tidy2(text): | |
"""uTidyLib's XHTML validator. | |
This function is a wrapper to uTidyLib's validator. | |
""" | |
text = tidy.parseString(text, output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0) | |
return _in_tag(str(text), 'body') | |
_tidy = _tidy2 | |
except ImportError: | |
_tidy = None | |
class Textile(object):
    # Regex fragments for Textile attribute syntax; they are interpolated
    # into larger patterns by the parsing methods.
    hlgn = r'(?:\<(?!>)|(?<!<)\>|\<\>|\=|[()]+(?! ))'   # horizontal alignment / padding
    vlgn = r'[\-^~]'                                    # vertical alignment
    clas = r'(?:\([^)]+\))'                             # (class#id)
    lnge = r'(?:\[[^\]]+\])'                            # [lang]
    styl = r'(?:\{[^}]+\})'                             # {css}
    cspn = r'(?:\\\d+)'                                 # \N column span
    rspn = r'(?:\/\d+)'                                 # /N row span

    a = r'(?:%s|%s)*' % (hlgn, vlgn)                    # any alignment markers
    s = r'(?:%s|%s)*' % (cspn, rspn)                    # any span markers
    c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn]) # any class/style/lang/align

    pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]'
    # urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]'
    urlch = r'''[\w"$\-_.+*'(),";\/?:@=&%#{}|\^~\[\]`]'''

    # URL schemes that relURL() lets through in restricted mode.
    url_schemes = ('http','https','ftp','mailto')

    # Block signatures recognised at the start of a paragraph.
    btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', r'fn\d+', 'p')

    # When true, image markup is left unprocessed.
    noimage = False
    # Prefix prepended to relative URLs by relURL().
    hu = ''

    # Replacement text for typographic glyphs, kept as numeric character
    # references so the source and the generated XHTML stay pure ASCII.
    glyph_defaults = (
        ('txt_quote_single_open',  '&#8216;'),
        ('txt_quote_single_close', '&#8217;'),
        ('txt_quote_double_open',  '&#8220;'),
        ('txt_quote_double_close', '&#8221;'),
        ('txt_apostrophe',         '&#8217;'),
        ('txt_prime',              '&#8242;'),
        ('txt_prime_double',       '&#8243;'),
        ('txt_ellipsis',           '&#8230;'),
        ('txt_emdash',             '&#8212;'),
        ('txt_endash',             '&#8211;'),
        ('txt_dimension',          '&#215;'),
        ('txt_trademark',          '&#8482;'),
        ('txt_registered',         '&#174;'),
        ('txt_copyright',          '&#169;'),
    )
def __init__(self, restricted=False, lite=False): | |
"""docstring for __init__""" | |
self.restricted = restricted | |
self.lite = lite | |
self.fn = {} | |
self.urlrefs = {} | |
self.shelf = {} | |
self.rel = '' | |
def textile(self, text, rel=None, encoding='utf8', output='utf8', validate=False, sanitize=False, head_offset='ignored'):
    """
    Convert Textile markup in *text* to XHTML.

    rel         -- if given, added as a rel="..." attribute on generated links
    encoding    -- encoding used to decode the intermediate byte string
    output      -- encoding of the returned byte string; characters it cannot
                   represent become numeric character references
    validate    -- run the result through Tidy when a binding is available
    sanitize    -- filter the result through the HTML whitelist sanitizer
    head_offset -- accepted but not used

    >>> import textile
    >>> textile.textile('some textile')
    '\\t<p>some textile</p>'

    NOTE(review): restricted mode does not escape raw HTML in the input
    here, while fCode()/fPre() skip escaping when restricted -- that looks
    like the input was expected to be escaped up front; confirm.
    """
    if rel:
        self.rel = ' rel="%s"' % rel

    text = self.getRefs(_normalize_newlines(text))

    # Lite mode skips block-level processing altogether.
    if not self.lite:
        text = self.block(text)

    # Put shelved fragments back in place of their placeholders.
    text = self.retrieve(text)

    # Convert to desired output.
    if isinstance(text, str):
        text = unicode(text, encoding)
    text = text.encode(output, 'xmlcharrefreplace')

    # Sanitize?
    if sanitize:
        sanitizer = _HTMLSanitizer()
        sanitizer.feed(text)
        text = sanitizer.output()

    # Validate output (only when a Tidy binding was importable).
    if validate and _tidy:
        text = _tidy(text)

    return text
def pba(self, input, element=None):
    r"""
    Parse a Textile block-attribute string into XHTML attribute text.

    input   -- the raw attribute markers, e.g. '(cls#id){color:red}[fr]'
    element -- 'td' enables colspan/rowspan and vertical alignment,
               'tr' enables vertical alignment only

    Returns a string of ' name="value"' pairs (possibly empty).  In
    restricted mode only the lang attribute is ever returned.

    >>> t = Textile()
    >>> t.pba(r'\3')
    ''
    >>> t.pba(r'\3', element='td')
    ' colspan="3"'
    >>> t.pba(r'/4', element='td')
    ' rowspan="4"'
    >>> t.pba(r'\3/4', element='td')
    ' colspan="3" rowspan="4"'

    >>> t.vAlign('^')
    'top'

    >>> t.pba('^', element='td')
    ' style="vertical-align:top;"'

    >>> t.pba('{line-height:18px}')
    ' style="line-height:18px;"'

    >>> t.pba('(foo-bar)')
    ' class="foo-bar"'

    >>> t.pba('(#myid)')
    ' id="myid"'

    >>> t.pba('(foo-bar#myid)')
    ' class="foo-bar" id="myid"'

    >>> t.pba('((((')
    ' style="padding-left:4em;"'

    >>> t.pba(')))')
    ' style="padding-right:3em;"'

    >>> t.pba('[fr]')
    ' lang="fr"'

    """
    if not input:
        return ''

    style = []
    aclass = ''
    lang = ''
    colspan = ''
    rowspan = ''
    elem_id = ''

    # Working copy: recognised markers are removed as they are consumed so
    # that later, looser patterns do not match them again.
    matched = input

    if element == 'td':
        m = re.search(r'\\(\d+)', matched)
        if m:
            colspan = m.group(1)

        m = re.search(r'/(\d+)', matched)
        if m:
            rowspan = m.group(1)

    if element == 'td' or element == 'tr':
        m = re.search(r'(%s)' % self.vlgn, matched)
        if m:
            style.append("vertical-align:%s;" % self.vAlign(m.group(1)))

    # {css} -- normalised to end in exactly one ';'
    m = re.search(r'\{([^}]*)\}', matched)
    if m:
        style.append(m.group(1).rstrip(';') + ';')
        matched = matched.replace(m.group(0), '')

    # [lang]
    m = re.search(r'\[([^\]]+)\]', matched, re.U)
    if m:
        lang = m.group(1)
        matched = matched.replace(m.group(0), '')

    # (class) / (class#id) / (#id)
    m = re.search(r'\(([^()]+)\)', matched, re.U)
    if m:
        aclass = m.group(1)
        matched = matched.replace(m.group(0), '')

    # Remaining bare parentheses are padding: one em per paren.
    m = re.search(r'([(]+)', matched)
    if m:
        style.append("padding-left:%sem;" % len(m.group(1)))
        matched = matched.replace(m.group(0), '')

    m = re.search(r'([)]+)', matched)
    if m:
        style.append("padding-right:%sem;" % len(m.group(1)))
        matched = matched.replace(m.group(0), '')

    m = re.search(r'(%s)' % self.hlgn, matched)
    if m:
        style.append("text-align:%s;" % self.hAlign(m.group(1)))

    # Split "class#id" into its two parts.
    m = re.search(r'^(.*)#(.*)$', aclass)
    if m:
        elem_id = m.group(2)
        aclass = m.group(1)

    if self.restricted:
        # Only the language survives in restricted mode.
        if lang:
            return ' lang="%s"' % lang
        return ''

    result = []
    if style:
        result.append(' style="%s"' % "".join(style))
    if aclass:
        result.append(' class="%s"' % aclass)
    if lang:
        result.append(' lang="%s"' % lang)
    if elem_id:
        result.append(' id="%s"' % elem_id)
    if colspan:
        result.append(' colspan="%s"' % colspan)
    if rowspan:
        result.append(' rowspan="%s"' % rowspan)
    return ''.join(result)
def hasRawText(self, text):
    """
    Report whether *text* contains anything outside block-level markup.

    Complete block elements (p, blockquote, div, form, table, ul, ol, pre,
    h1-h9) and self-closed hr/br tags are removed; the result is True if
    any other text is left over.

    >>> t = Textile()
    >>> t.hasRawText('<p>foo bar biz baz</p>')
    False

    >>> t.hasRawText(' why yes, yes it does')
    True

    """
    block_re = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*</\1>', re.S)
    void_re = re.compile(r'<(hr|br)[^>]*?/>')

    leftover = block_re.sub('', text.strip()).strip()
    leftover = void_re.sub('', leftover)
    return leftover != ''
def table(self, text):
    r"""
    Replace every Textile table in *text* with XHTML table markup.

    A table is a run of '|'-delimited rows ended by a blank line, optionally
    preceded by a 'table(attrs).' line.  Two newlines are appended to the
    input before matching, so they are present in the result even when no
    table is found.  Each match is rendered by fTable().

    >>> t = Textile()
    >>> t.table('|one|two|three|\n|a|b|c|')
    '\t<table>\n\t\t<tr>\n\t\t\t<td>one</td>\n\t\t\t<td>two</td>\n\t\t\t<td>three</td>\n\t\t</tr>\n\t\t<tr>\n\t\t\t<td>a</td>\n\t\t\t<td>b</td>\n\t\t\t<td>c</td>\n\t\t</tr>\n\t</table>\n\n'
    """
    fragments = {'s': self.s, 'a': self.a, 'c': self.c}
    table_re = re.compile(
        r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % fragments,
        re.S | re.M | re.U)
    return table_re.sub(self.fTable, text + "\n\n")
def fTable(self, match):
    """Render one table match (from table()) as XHTML.

    match.group(1) holds the optional table-level attribute markers and
    match.group(2) the '|'-delimited rows, one per line.  Returns the
    <table> markup followed by two newlines.
    """
    tatts = self.pba(match.group(1), 'table')
    rows = []
    # Empty lines are skipped.
    for row in [ x for x in match.group(2).split('\n') if x]:
        # Optional row attributes: "<markers>. " at the start of the row.
        rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip())
        if rmtch:
            ratts = self.pba(rmtch.group(1), 'tr')
            row = rmtch.group(2)
        else: ratts = ''

        cells = []
        for cell in row.split('|'):
            # A leading underscore marks a header cell (<th>).
            ctyp = 'd'
            if re.search(r'^_', cell): ctyp = "h"
            # Optional cell attributes: "<markers>. " at the start of the cell.
            cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell)
            if cmtch:
                catts = self.pba(cmtch.group(1), 'td')
                cell = cmtch.group(2)
            else: catts = ''

            cell = self.graf(self.span(cell))
            # Splitting on '|' yields empty pieces at the row edges; drop
            # every cell that renders to whitespace only.
            if cell.strip() != '':
                cells.append('\t\t\t<t%s%s>%s</t%s>' % (ctyp, catts, cell, ctyp))
        rows.append("\t\t<tr%s>\n%s\n\t\t</tr>" % (ratts, '\n'.join(cells)))
        # Both names are reassigned before their next use.
        cells = []
        catts = None
    return "\t<table%s>\n%s\n\t</table>\n\n" % (tatts, '\n'.join(rows))
def lists(self, text):
    """
    Replace Textile bullet ('*') and numbered ('#') lists with XHTML lists.

    Each matched run of list lines is rendered by fList().

    >>> t = Textile()
    >>> t.lists("* one\\n* two\\n* three")
    '\\t<ul>\\n\\t\\t<li>one</li>\\n\\t\\t<li>two</li>\\n\\t\\t<li>three</li>\\n\\t</ul>'
    """
    list_re = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U | re.M | re.S)
    return list_re.sub(self.fList, text)
def fList(self, match):
    """Render one run of list lines (matched by lists()) as nested XHTML lists.

    The marker string of each item ('*', '**', '#', ...) identifies its
    list; `lists` tracks the marker strings of the lists currently open.
    """
    text = match.group(0).split("\n")
    result = []
    lists = []
    for i, line in enumerate(text):
        # Look ahead one line to decide whether to close the item/list.
        try:
            nextline = text[i+1]
        except IndexError:
            nextline = ''

        m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S)
        if m:
            # tl: this line's marker string; nl: the next line's ('' if the
            # next line is not a list item).
            tl, atts, content = m.groups()
            nl = ''
            nm = re.search(r'^([#*]+)\s.*', nextline)
            if nm:
                nl = nm.group(1)
            if tl not in lists:
                # First item with this marker: open a new <ul>/<ol>.
                lists.append(tl)
                atts = self.pba(atts)
                line = "\t<%sl%s>\n\t\t<li>%s" % (self.lT(tl), atts, self.graf(content))
            else:
                line = "\t\t<li>" + self.graf(content)

            # Leave the <li> open when the next line starts a deeper list.
            if len(nl) <= len(tl): line = line + "</li>"
            # Close every open list deeper than the next line's level,
            # innermost first.
            for k in reversed(lists):
                if len(k) > len(nl):
                    line = line + "\n\t</%sl>" % self.lT(k)
                    # A nested list also closes the parent item that holds it.
                    if len(k) > 1:
                        line = line + "</li>"
                    lists.remove(k)

        # Lines that are not list items are passed through unchanged.
        result.append(line)
    return "\n".join(result)
def lT(self, input):
    """Return the list-tag prefix for a marker string.

    'o' (for <ol>) when the marker starts with '#', otherwise 'u' (for <ul>).
    """
    return 'o' if input.startswith('#') else 'u'
def doPBr(self, in_):
    """Convert single newlines inside <p>...</p> elements to <br />.

    Every paragraph element found in *in_* is rewritten by doBr().
    """
    paragraph_re = re.compile(r'<(p)([^>]*?)>(.*)(</\1>)', re.S)
    return paragraph_re.sub(self.doBr, in_)
def doBr(self, match):
    """Rebuild one paragraph with line breaks inserted.

    *match* comes from doPBr(): group 1 is the tag name, group 2 its
    attributes, group 3 the content and group 4 the closing tag.  A newline
    in the content becomes '<br />' unless the following character is
    '#', '*', '|' or whitespace.
    """
    tag, attributes, body, closing = match.group(1, 2, 3, 4)
    body = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br />', body)
    return '<%s%s>%s%s' % (tag, attributes, body, closing)
def block(self, text):
    """
    Split *text* into blank-line-separated blocks and render each one.

    A block that starts with a signature from `btag` (e.g. 'h1. ', 'bq. ')
    is rendered with that tag; any other block is rendered as a paragraph,
    or as a continuation of the previous block when that block was opened
    in extended form ('bq.. ').  Returns the rendered blocks joined by
    blank lines.

    >>> t = Textile()
    >>> t.block('h1. foobar baby')
    '\\t<h1>foobar baby</h1>'
    """
    tre = '|'.join(self.btag)
    text = text.split('\n\n')

    # State carried from one block to the next; it only survives an
    # iteration while an extended block is open.
    tag = 'p'
    atts = cite = graf = ext = ''

    out = []

    # Set once the first unsigned block is seen and never cleared.
    anon = False
    for line in text:
        # signature, attributes, '.', optional second '.' (extended),
        # optional ':cite', a space, then the content.
        pattern = r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c)
        match = re.search(pattern, line, re.S)
        if match:
            # A new signature ends any extended block that is still open.
            if ext:
                out.append(out.pop() + c1)

            tag,atts,ext,cite,graf = match.groups()
            o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, graf)
            # leave off c1 if this block is extended, we'll close it at the start of the next block
            if ext:
                line = "%s%s%s%s" % (o1, o2, content, c2)
            else:
                line = "%s%s%s%s%s" % (o1, o2, content, c2, c1)

        else:
            anon = True
            # Blocks starting with whitespace are not wrapped unless they
            # continue an extended block.
            if ext or not re.search(r'^\s', line):
                o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, line)
                # skip o1/c1 because this is part of a continuing extended block
                if tag == 'p' and not self.hasRawText(content):
                    line = content
                else:
                    line = "%s%s%s" % (o2, content, c2)
            else:
                line = self.graf(line)

        line = self.doPBr(line)
        line = re.sub(r'<br>', '<br />', line)

        # Continuations are glued onto the previous output entry.
        if ext and anon:
            out.append(out.pop() + "\n" + line)
        else:
            out.append(line)

        # Outside an extended block the next unsigned block is a plain <p>.
        if not ext:
            tag = 'p'
            atts = ''
            cite = ''
            graf = ''

    # Close an extended block that runs to the end of the text.
    if ext:
        out.append(out.pop() + c1)
    return '\n\n'.join(out)
def fBlock(self, tag, atts, ext, cite, content):
    """
    Build the pieces of one rendered block.

    Returns a 5-tuple (o1, o2, content, c2, c1): outer opening markup,
    inner opening markup, the processed content, inner closing markup and
    outer closing markup.  *ext* is accepted but not used here.

    >>> t = Textile()
    >>> t.fBlock("bq", "", None, "", "Hello BlockQuote")
    ('\\t<blockquote>\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')

    >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote")
    ('\\t<blockquote cite="http://google.com">\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')

    >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS
    ('<pre>', '<code>', ..., '</code>', '</pre>')

    >>> t.fBlock("h1", "", None, "", "foobar")
    ('', '\\t<h1>', 'foobar', '</h1>', '')
    """
    atts = self.pba(atts)
    o1 = o2 = c2 = c1 = ''

    # Footnote blocks ("fnN.") become paragraphs carrying an anchor id.
    footnote = re.search(r'fn(\d+)', tag)
    if footnote:
        number = footnote.group(1)
        tag = 'p'
        # Use the id generated for an earlier reference, else the number.
        fnid = self.fn.get(number, number)
        atts = atts + ' id="fn%s"' % fnid
        if 'class=' not in atts:
            atts = atts + ' class="footnote"'
        content = ('<sup>%s</sup>' % number) + content

    if tag == 'bq':
        cite = self.checkRefs(cite)
        cite = ' cite="%s"' % cite if cite else ''
        o1 = "\t<blockquote%s%s>\n" % (cite, atts)
        o2 = "\t\t<p%s>" % atts
        c2 = "</p>"
        c1 = "\n\t</blockquote>"

    elif tag == 'bc':
        # Code content is escaped and shelved so later passes skip it.
        content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
        o1 = "<pre%s>" % atts
        o2 = "<code%s>" % atts
        c2 = "</code>"
        c1 = "</pre>"

    elif tag == 'notextile':
        content = self.shelve(content)
        o1 = o2 = ''
        c1 = c2 = ''

    elif tag == 'pre':
        content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
        o1 = "<pre%s>" % atts
        o2 = c2 = ''
        c1 = '</pre>'

    else:
        o2 = "\t<%s%s>" % (tag, atts)
        c2 = "</%s>" % tag

    # Inline processing runs for every block type.
    return o1, o2, self.graf(content), c2, c1
def footnoteRef(self, text):
    """
    Turn footnote references such as 'word[1]' into superscript links.

    A reference is a bracketed number directly after a word boundary; one
    following whitespace character, if present, is part of the match.
    Each match is rendered by footnoteID().

    >>> t = Textile()
    >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS
    'foo<sup class="footnote"><a href="#fn...">1</a></sup> '
    """
    reference_re = re.compile(r'\b\[([0-9]+)\](\s)?')
    return reference_re.sub(self.footnoteID, text)
def footnoteID(self, match):
    """Render one footnote reference matched by footnoteRef().

    The first reference to a footnote number generates a UUID that is
    remembered in self.fn; later references reuse it as the link target.
    The optional trailing whitespace captured by the match is preserved.
    """
    number, trailing = match.groups()
    if number not in self.fn:
        self.fn[number] = str(uuid.uuid4())
    return '<sup class="footnote"><a href="#fn%s">%s</a></sup>%s' % (
        self.fn[number], number, trailing or '')
def glyphs(self, text):
    """
    Replace plain-ASCII punctuation in *text* with typographic glyphs.

    Quotes, apostrophes, ellipses, dashes, dimension signs and the
    (tm)/(r)/(c) marks are replaced using the values in glyph_defaults;
    runs of capitals are wrapped in <span class="caps"> or <acronym>.
    Anything inside an HTML tag is left untouched.

    >>> t = Textile()

    >>> t.glyphs("apostrophe's")
    'apostrophe&#8217;s'

    >>> t.glyphs("back in '88")
    'back in &#8217;88'

    >>> t.glyphs('foo ...')
    'foo &#8230;'

    >>> t.glyphs('--')
    '&#8212;'

    >>> t.glyphs('FooBar[tm]')
    'FooBar&#8482;'

    >>> t.glyphs("<p><cite>Cat's Cradle</cite> by Vonnegut</p>")
    '<p><cite>Cat&#8217;s Cradle</cite> by Vonnegut</p>'

    """
    # fix: hackish -- give a double quote at the very end of the text a
    # trailing space so the closing-quote rule can see something after it.
    # (\Z is the end-of-string anchor; a lower-case \z is not an anchor in
    # Python 2's re and matched a literal "z".)
    text = re.sub(r'"\Z', '" ', text)

    glyph_search = (
        re.compile(r"(\w)\'(\w)"),                                      # apostrophe's
        re.compile(r'(\s)\'(\d+\w?)\b(?!\')'),                          # back in '88
        re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'),                  #  single closing
        re.compile(r"'"),                                               #  single opening
        re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'),                  #  double closing
        re.compile(r'"'),                                               #  double opening
        re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'),        #  3+ uppercase acronym
        re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'),           #  3+ uppercase
        re.compile(r'\b(\s{0,1})?\.{3}'),                               #  ellipsis
        re.compile(r'(\s?)--(\s?)'),                                    #  em dash
        re.compile(r'\s-(?:\s|$)'),                                     #  en dash
        re.compile(r'(\d+)( ?)x( ?)(?=\d+)'),                           #  dimension sign
        re.compile(r'\b ?[(\[]TM[\])]', re.I),                          #  trademark
        re.compile(r'\b ?[(\[]R[\])]', re.I),                           #  registered
        re.compile(r'\b ?[(\[]C[\])]', re.I),                           #  copyright
    )

    # Replacement templates, pairwise aligned with glyph_search.
    glyph_replace = [x % dict(self.glyph_defaults) for x in (
        r'\1%(txt_apostrophe)s\2',           # apostrophe's
        r'\1%(txt_apostrophe)s\2',           # back in '88
        r'\1%(txt_quote_single_close)s',     #  single closing
        r'%(txt_quote_single_open)s',        #  single opening
        r'\1%(txt_quote_double_close)s',     #  double closing
        r'%(txt_quote_double_open)s',        #  double opening
        r'<acronym title="\2">\1</acronym>', #  3+ uppercase acronym
        r'<span class="caps">\1</span>',     #  3+ uppercase
        r'\1%(txt_ellipsis)s',               #  ellipsis
        r'\1%(txt_emdash)s\2',               #  em dash
        r' %(txt_endash)s ',                 #  en dash
        r'\1\2%(txt_dimension)s\3',          #  dimension sign
        r'%(txt_trademark)s',                #  trademark
        r'%(txt_registered)s',               #  registered
        r'%(txt_copyright)s',                #  copyright
    )]

    result = []
    # Split into tags and text runs; only the text runs are rewritten.
    for line in re.compile(r'(<.*?>)', re.U).split(text):
        if not re.search(r'<.*>', line):
            for s, r in zip(glyph_search, glyph_replace):
                line = s.sub(r, line)
        result.append(line)
    return ''.join(result)
def iAlign(self, input):
    """Translate an image alignment marker ('<', '=', '>') into
    'left', 'center' or 'right'; any other value gives ''."""
    names = dict([('<', 'left'), ('=', 'center'), ('>', 'right')])
    return names.get(input, '')
def vAlign(self, input):
    """Translate a vertical alignment marker ('^', '-', '~') into
    'top', 'middle' or 'bottom'; any other value gives ''."""
    names = dict([('^', 'top'), ('-', 'middle'), ('~', 'bottom')])
    return names.get(input, '')
def hAlign(self, input):
    """Translate a horizontal alignment marker ('<', '=', '>', '<>') into
    'left', 'center', 'right' or 'justify'; any other value gives ''."""
    names = dict([('<', 'left'), ('=', 'center'), ('>', 'right'), ('<>', 'justify')])
    return names.get(input, '')
def getRefs(self, text):
    """
    Collect link-alias definitions and strip them from *text*.

    A definition has the form ``[alias]http://example.com/`` (also
    ``https://`` or a URL starting with ``/``) at the start of the text or
    after whitespace, and ends at the next whitespace.  Each definition is
    handed to refs(), which records it and removes it from the output, so
    that links written as ``"text":alias`` can later be resolved by
    checkRefs().
    """
    # The alias is matched non-greedily so that two definitions on the same
    # line are not merged into one.
    pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+?)\]((?:https?:\/\/|\/)\S+)(?=\s|$)', re.U)
    return pattern.sub(self.refs, text)
def refs(self, match):
    """Record one alias definition found by getRefs().

    Group 1 of *match* is the alias and group 2 its URL.  Returns '' so the
    definition disappears from the text.
    """
    self.urlrefs[match.group(1)] = match.group(2)
    return ''
def checkRefs(self, url):
    """Resolve a link alias: return the URL recorded for *url* in
    self.urlrefs, or *url* itself when it is not a known alias."""
    try:
        return self.urlrefs[url]
    except KeyError:
        return url
def relURL(self, url):
    """Return the URL to emit for *url*.

    A relative URL whose path starts with a word character gets self.hu
    prepended.  In restricted mode a URL with a scheme that is not listed
    in self.url_schemes is replaced by '#'.
    """
    parts = urlparse(url)
    scheme, netloc, path = parts[0], parts[1], parts[2]

    is_relative = (not scheme or scheme == 'http') and not netloc
    if is_relative and re.search(r'^\w', path):
        url = self.hu + url

    if self.restricted and scheme and scheme not in self.url_schemes:
        return '#'
    return url
def shelve(self, text):
    """Store *text* in self.shelf under a fresh UUID and return that key.

    The key is put into the document as a placeholder; retrieve() later
    swaps it back for the stored text.
    """
    key = str(uuid.uuid4())
    self.shelf[key] = text
    return key
def retrieve(self, text):
    """
    Replace every shelf placeholder in *text* with its stored fragment.

    The substitution pass is repeated until the text stops changing, so a
    stored fragment may itself contain placeholders.

    >>> t = Textile()
    >>> id = t.shelve("foobar")
    >>> t.retrieve(id)
    'foobar'
    """
    while True:
        before = text
        for key, fragment in self.shelf.items():
            text = text.replace(key, fragment)
        if before == text:
            return text
def encode_html(self, text, quotes=True):
    """Escape HTML-special characters in *text* as numeric references.

    '&', '<' and '>' are always escaped; single and double quotes are
    escaped as well unless *quotes* is false.  Returns the escaped string.
    """
    # '&' must be replaced first so the ampersands introduced by the other
    # replacements are not escaped a second time.
    replacements = [
        ('&', '&#38;'),
        ('<', '&#60;'),
        ('>', '&#62;'),
    ]
    if quotes:
        replacements += [
            ("'", '&#39;'),
            ('"', '&#34;'),
        ]
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
def graf(self, text):
    """Run the inline formatting passes over *text*.

    Order: notextile and code protection (skipped in lite mode), links,
    images (unless self.noimage), lists and tables (skipped in lite mode),
    spans, footnote references, glyphs.  Trailing newlines are removed from
    the result.
    """
    full = not self.lite

    stages = []
    if full:
        stages.extend([self.noTextile, self.code])
    stages.append(self.links)
    if not self.noimage:
        stages.append(self.image)
    if full:
        stages.extend([self.lists, self.table])
    stages.extend([self.span, self.footnoteRef, self.glyphs])

    for stage in stages:
        text = stage(text)
    return text.rstrip('\n')
def links(self, text):
    """
    Replace Textile links of the form "text(title)":url with anchors.

    Each match is rendered (and shelved) by fLink().

    >>> t = Textile()
    >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS
    'fooobar ... and hello world ...'
    """
    # Punctuation that may directly precede the opening quote.
    punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'

    link_re = re.compile(r'''
        ([\s\[{(]|[%s])?       # $pre
        "                      # start
        (%s)                   # $atts
        ([^"]+?)               # $text
        \s?
        (?:\(([^)]+?)\)(?="))? # $title
        ":
        (\S+?)                 # $url
        (\/)?                  # $slash
        ([^\w\/;]*?)           # $post
        (?=<|\s|$)
    ''' % (re.escape(punct), self.c), re.X)
    return link_re.sub(self.fLink, text)
def fLink(self, match):
    """Render one link matched by links().

    The groups are (pre, atts, text, title, url, slash, post).  The anchor
    is shelved so later passes do not touch it; the returned string is
    pre + placeholder + post.
    """
    pre, atts, text, title, url, slash, post = match.groups()

    # Nothing preceded the opening quote.
    if pre is None:
        pre = ''

    url = self.checkRefs(url)

    atts = self.pba(atts)
    if title:
        atts = atts + ' title="%s"' % self.encode_html(title)

    # The link text may itself contain images, spans and glyphs.
    if not self.noimage:
        text = self.image(text)
    text = self.glyphs(self.span(text))

    url = self.relURL(url)
    if slash:
        url = url + slash

    anchor = '<a href="%s"%s%s>%s</a>' % (self.encode_html(url), atts, self.rel, text)
    return ''.join([pre, self.shelve(anchor), post])
def span(self, text):
    """
    Replace inline phrase modifiers (*strong*, _em_, %span%, ...) with tags.

    One pass is made per modifier, in the order listed in `qtags`, so that
    the doubled forms ('**', '__') are handled before the single ones.
    Each match is rendered by fSpan().

    >>> t = Textile()
    >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
    'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
    """
    qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
    # Punctuation allowed directly before the opening / after the content.
    pnct = ".,\"'?!;:"

    template = r"""
        (?:^|(?<=[\s>%(pnct)s])|([\]}]))
        (%(qtag)s)(?!%(qtag)s)
        (%(c)s)
        (?::(\S+))?
        ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
        ([%(pnct)s]*)
        %(qtag)s
        (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
    """
    fragments = {'c': self.c, 'pnct': pnct, 'selfpnct': self.pnct}

    for qtag in qtags:
        fragments['qtag'] = qtag
        text = re.compile(template % fragments, re.X).sub(self.fSpan, text)
    return text
def fSpan(self, match):
    """Render one phrase modifier matched by span().

    The groups are (pre, tag, atts, cite, content, end, post); only the
    middle five are used.  Returns the content wrapped in the HTML element
    that corresponds to the modifier.
    """
    _, tag, atts, cite, content, end, _ = match.groups()

    qtags = {
        '*': 'strong',
        '**': 'b',
        '??': 'cite',
        '_' : 'em',
        '__': 'i',
        '-' : 'del',
        '%' : 'span',
        '+' : 'ins',
        '~' : 'sub',
        '^' : 'sup'
    }
    tag = qtags[tag]
    atts = self.pba(atts)
    if cite:
        # The leading space separates the attribute from the tag name or
        # from the preceding attribute.
        atts = atts + ' cite="%s"' % cite

    return "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
def image(self, text):
    """
    Replace Textile image markup (!src(title)!:href) with <img> elements.

    Each match is rendered by fImage().

    >>> t = Textile()
    >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
    '<a href="http://jsamsa.com"><img src="/imgs/myphoto.jpg" alt="" /></a>'
    """
    image_re = re.compile(r"""
        (?:[\[{])?          # pre
        \!                 # opening !
        (\<|\=|\>)??       # optional alignment atts
        (%s)               # optional style,class atts
        (?:\. )?           # optional dot-space
        ([^\s(!]+)         # presume this is the src
        \s?                # optional space
        (?:\(([^\)]+)\))?  # optional title
        \!                 # closing
        (?::(\S+))?        # optional href
        (?:[\]}]|(?=\s|$)) # lookahead: space or end of string
    """ % self.c, re.U | re.X)
    return image_re.sub(self.fImage, text)
def fImage(self, match):
    """Render one image matched by image().

    The groups are (algn, atts, url, title, href), e.g.
    (None, '', '/imgs/myphoto.jpg', None, None).  Returns an <img> element,
    wrapped in an <a> element when an href was given.
    """
    algn, atts, url, title, href = match.groups()

    atts = self.pba(atts)
    if algn:
        atts = atts + ' align="%s"' % self.iAlign(algn)

    # The title doubles as the alt text; alt is always present.
    if title:
        atts = atts + ' title="%s" alt="%s"' % (title, title)
    else:
        atts = atts + ' alt=""'

    # TODO how to do this in python?
    # size = @getimagesize(url)
    # if (size) atts .= " size[3]"

    if href:
        href = self.checkRefs(href)

    url = self.relURL(self.checkRefs(url))

    img = '<img src="%s"%s />' % (url, atts)
    if href:
        return ''.join(['<a href="%s">' % href, img, '</a>'])
    return img
def code(self, text):
    """Protect inline code from further processing.

    Handles, in order, <code>...</code>, @...@ and <pre>...</pre>; the
    first two are rendered by fCode(), the last by fPre().
    """
    for start, end, handler in (('<code>', '</code>', self.fCode),
                                ('@', '@', self.fCode),
                                ('<pre>', '</pre>', self.fPre)):
        text = self.doSpecial(text, start, end, handler)
    return text
def fCode(self, match):
    """Render one inline code match from doSpecial().

    The groups are (before, text, after).  The text is HTML-escaped unless
    the converter is in restricted mode, wrapped in <code> and shelved.
    """
    before, text, after = match.groups()
    # text needs to be escaped
    if not self.restricted:
        text = self.encode_html(text)
    placeholder = self.shelve('<code>%s</code>' % text)
    # `after` is None when the optional trailing group did not match.
    return ''.join([before, placeholder, after or ''])
def fPre(self, match):
    """Render one <pre> match from doSpecial().

    The groups are (before, text, after).  The text is HTML-escaped unless
    the converter is in restricted mode and then shelved; the <pre> tags
    themselves stay in the document.
    """
    before, text, after = match.groups()
    # text needs to be escaped
    if not self.restricted:
        text = self.encode_html(text)
    # `after` is None when the optional trailing group did not match.
    return ''.join([before, '<pre>', self.shelve(text), '</pre>', after or ''])
def doSpecial(self, text, start, end, method=None):
    """Apply *method* to every span delimited by *start* and *end*.

    A span must begin at the start of a line, after whitespace, or after
    one of '[', '(', '{', '>'.  *method* receives a match whose groups are
    (before, content, after); `after` is None when nothing recognised
    follows the closing delimiter.  Defaults to self.fSpecial.
    """
    handler = self.fSpecial if method is None else method
    special_re = re.compile(
        r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end)),
        re.M | re.S)
    return special_re.sub(handler, text)
def fSpecial(self, match):
    """
    Default handler for doSpecial(): special blocks like notextile or code.

    The matched content is HTML-escaped and shelved; the surrounding
    characters are kept.
    """
    before, text, after = match.groups()
    placeholder = self.shelve(self.encode_html(text))
    # `after` is None when the optional trailing group did not match.
    return ''.join([before, placeholder, after or ''])
def noTextile(self, text):
    """Shelve text that must not be processed as Textile.

    Handles <notextile>...</notextile> first, then ==...==; both are
    rendered by fTextile().
    """
    for start, end in (('<notextile>', '</notextile>'), ('==', '==')):
        text = self.doSpecial(text, start, end, self.fTextile)
    return text
def fTextile(self, match):
    """Shelve one notextile span verbatim.

    The groups are (before, notextile, after); the content is stored
    without escaping and replaced by its placeholder.
    """
    before, notextile, after = match.groups()
    # `after` is None when the optional trailing group did not match.
    return ''.join([before, self.shelve(notextile), after or ''])
def textile(text, **args):
    """
    Convert *text* with a fresh, default-configured Textile instance.

    this function takes additional parameters, passed on unchanged to
    Textile.textile():
    encoding - input encoding (default: 'utf8')
    output - output encoding (default: 'utf8')
    validate - perform mxTidy or uTidyLib validation (default: False)
    sanitize - sanitize output good for weblog comments (default: False)
    head_offset - ignored
    """
    converter = Textile()
    return converter.textile(text, **args)
def _test():
    """Run the doctests embedded in this module's docstrings."""
    from doctest import testmod
    testmod()
# Command-line entry point: with one argument, convert that file and print
# the result; otherwise run the doctests.
if __name__ == "__main__":
    import sys
    if len(sys.argv) == 2:
        f = open(sys.argv[1])
        # try/finally so the handle is released even if reading fails.
        try:
            text = f.read()
        finally:
            f.close()
        # Parenthesised single-argument form prints the same value whether
        # print is a statement or a function.
        print(Textile().textile(text))
    else:
        _test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment