Created
January 14, 2014 20:11
-
-
Save edunham/8424787 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7
## | |
# NAME | |
# wiki2rst.py -- convert Dokuwiki page syntax to reStructured Text | |
# | |
# REQUIRES | |
# pandoc (ext.program) | |
# | |
# SYNTAX | |
# wiki2rst.py pages pages/tutorials/tutorial-part-4-2.txt (single file) | |
# wiki2rst.py pages (directory tree) | |
# | |
# DESCRIPTION | |
# wiki2rst.py takes .txt files with Dokuwiki syntax, and converts them to | |
# corresponding .rst markup. You either run it on a single file or an entire | |
# directory tree. In the single-file invocation output is written to STDOUT, | |
# for directory trees, .rst files will be created corresponding to the input | |
# file paths. The first argument to the script is always a root directory | |
# from which e.g. internal links are calculated. | |
# For embedded HTML in Dokuwiki files it relies on the pandoc [1] program to | |
# convert it to reST. | |
# | |
# CAVEATS | |
# After the conversion the resulting .rst markup most likely needs some | |
# finishing touches to get the desired rendering. | |
# There is probably some legacy junk code in it, stemming from the time when we | |
# ported the entire manual from Dokuwiki to reST. | |
# | |
# [1] http://www.johnmacfarlane.net/pandoc/ | |
## | |
import sys, os, re, codecs | |
from pyparsing import * | |
# -- transform stuff ------------------------ | |
def convertToRst(opening, closing):
    """Return a parse action that wraps the matched text in reST markup.

    The returned callable takes the usual ``(s, l, t)`` parse-action
    arguments, runs ``wikiMarkup.transformString`` over ``t[0]`` (so wiki
    markup nested inside the match is converted as well) and returns the
    result surrounded by *opening* and *closing*.
    """
    def conversionParseAction(s, l, t):
        inner = wikiMarkup.transformString(t[0])
        return opening + inner + closing

    return conversionParseAction
# Namespace marker that identifies links into the 1.2 manual.
ns12 = ":1.2:"
# Level of the most recently emitted heading (0 = none yet in this document).
headLevel = 0


def internalLink(url):
    """Translate a Dokuwiki-internal link target into a path or URL.

    url -- the raw link target as written in the wiki source.

    Returns the lower-cased target with ``:`` namespace separators turned
    into ``/``.  Manual links (those containing ``ns12``) are cut down to
    the part after the namespace marker and get a ``pages/`` prefix when
    the file currently being converted (module global ``currentInput``)
    ends with ``index.txt``.  Other wiki links that start with ``:``,
    ``documentation`` or ``about`` become ``http://qooxdoo.org/...`` URLs.

    The prefix is attached *after* the separator replacement, so no
    placeholder character is needed to protect the ``:`` of ``http:`` and
    a literal ``*`` in a target is left alone.
    """
    # wiki links are all-lowercase
    url = url.lower()
    prefix = ""
    marker = url.find(ns12)
    if marker > -1:
        # manual url
        url = url[marker + len(ns12):]
        if currentInput.endswith("index.txt"):
            prefix = "pages/"
    else:
        # wiki url
        if url.startswith(".:"):
            url = url[2:]
        if url.startswith("."):
            url = url[1:]
        # This does not cover everything, e.g. "lowleveloverview" should
        # map to documentation/lowleveloverview.
        if url.startswith((":", "documentation", "about")):
            if url.startswith(":"):
                url = url[1:]
            prefix = "http://qooxdoo.org/"
    return prefix + url.replace(":", "/")
# Cache of normalized file paths that absolutizeUrl() has already resolved.
fileset = set()


def absolutizeUrl(url, withext=False):
    """Resolve a relative wiki target to a path below ``dirroot``.

    Example: generator_config_articles#packages_key ->
    pages/tool/generator_config_article#packages_key

    url     -- link target, optionally carrying a ``#fragment``.
    withext -- keep the file extension on paths found by the directory
               walk (cached entries are returned as stored).

    The target is lower-cased and its fragment split off.  The first
    cached path, or else the first file found while walking ``dirroot``,
    whose lower-cased, normalized path ends with the fragment-less target
    is returned with the fragment re-attached.  If nothing matches, a
    message is written through ``eout`` and the lower-cased url is
    returned unchanged.
    """
    # force all-lowercase
    url = url.lower()
    # split off the fragment id; `fragment` keeps its leading '#'
    base, hashmark, anchor = url.partition('#')
    fragment = hashmark + anchor

    for known in fileset:
        if known.endswith(base):
            return known + fragment

    for root, dirs, files in os.walk(dirroot):
        for fname in files:
            stem, ext = os.path.splitext(fname)
            candidate = os.path.join(root, stem)
            if withext:
                candidate += ext
            # normpath gets rid of a leading "./"
            candidate = os.path.normpath(candidate.lower())
            if not candidate.endswith(base):
                continue
            fileset.add(candidate)
            return candidate + fragment

    eout(" could not absolutize url: %s" % url)
    return url
# URL prefixes that mark a link target as external (left untouched).
_EXTERNAL_SCHEMES = ("http://", "https://")


def _externalLink(text, url):
    """Format an external reST hyperlink, with or without link text."""
    if text:
        return '`%s <%s>`_' % (text, url)
    # With no text, "` <url>`_" is not valid inline markup (a start-string
    # must not be followed by whitespace); the bare embedded-URI form
    # makes docutils use the URI itself as the link text.
    return '`<%s>`_' % url


def convertToRst_A(s, l, t):
    """Parse action for ``[[target|text]]`` wiki links.

    ``t[0]`` holds the link body.  It is split at the first ``|`` into
    target and optional text, both stripped.

    Returns:
      * an external hyperlink reference when the target is an http(s)
        URL, or when ``internalLink`` turns it into one;
      * a ``:ref:`` role when the converted target contains a ``#``
        fragment -- a target that is only a fragment is prefixed with the
        extension-less path of ``currentInput``, any other is resolved
        through ``absolutizeUrl``;
      * a ``:doc:`` role otherwise.
    """
    url, _, text = t[0].partition("|")
    text = text.strip()
    url = url.strip()

    # external url: must not be lower-cased or have its ':' rewritten
    if url.startswith(_EXTERNAL_SCHEMES):
        return _externalLink(text, url)

    # intrawiki link -- which may still turn into an external link
    url = internalLink(url)
    if url.startswith(_EXTERNAL_SCHEMES):
        return _externalLink(text, url)

    # decide about link keyword
    if '#' in url:  # we have a document fragment identifier
        keyword = "ref"
        if url.startswith('#'):  # have to use the full label
            path, ext = os.path.splitext(currentInput)
            url = path + url
        else:
            url = absolutizeUrl(url)
    else:  # entire document link
        keyword = "doc"

    # put together with text
    if text:
        return ':%s:`%s <%s>`' % (keyword, text, url)
    return ':%s:`%s`' % (keyword, url)
def convertToRst_I(s, l, t):
    """Parse action for ``{{target|text}}`` media references.

    ``t[0]`` is split at the first ``|`` into target and optional text
    (text is stripped), and anything from the first ``?`` on is removed
    from the target.  Targets containing ``ns12`` (and not starting with
    ``http://``) are run through ``internalLink`` and ``absolutizeUrl``,
    then prefixed with ``/`` if they start with ``pages/``; every other
    target is used as is.

    If ``imgEnd`` matches the target, a substitution reference followed by
    its ``image::`` definition is returned (a random ``imageNNN`` name
    stands in for missing text); otherwise a ``:doc:`` role is returned.
    """
    import random

    # split url and text
    target, _, alt = t[0].partition("|")
    alt = alt.strip()
    # remove trailing artifacts
    target = re.sub(r'\?.*$', '', target)

    # only document:1.2: references need rewriting; the rest stay as is
    if not target.startswith("http://") and ns12 in target:
        target = absolutizeUrl(internalLink(target), withext=True)
        if target.startswith("pages/"):
            # need to 'absolutize' internal img refs,
            # http://sphinx.pocoo.org/rest.html#images
            target = "/" + target

    if not imgEnd.search(target):
        # .pdf, .zip, ...
        if alt:
            return ':doc:`%s <%s>`' % (alt, target)
        return ':doc:`%s`' % target

    # TODO: if the image ref is within the text flow, the block-level
    # definition below will break the text! (which is not often the case)
    if not alt:
        alt = "image" + str(random.choice(range(1000)))
    if ns12 in alt:
        # make the alt string look like the img uri
        alt = internalLink(alt)
    return "|%s|\n\n.. |%s| image:: %s\n\n" % (alt, alt, target)
# Characters that are dropped when a heading is turned into an anchor.
wikiNoChars = re.compile(r'["\')(?!]')


def wikianchor(txt):
    """Return the anchor name for heading text *txt*.

    The text is lower-cased, the characters matched by ``wikiNoChars``
    are removed, and spaces become underscores.
    """
    return wikiNoChars.sub('', txt.lower()).replace(" ", "_")
def convertHeading(linechar):
    """Return a parse action that renders a heading underlined with *linechar*.

    The heading level is taken from the digit of the ``linechars`` key
    whose value equals *linechar*.  The returned action:

      * emits placeholder ``XXX`` headings for every level skipped since
        the previous heading (tracked in the global ``headLevel``),
      * records its own level in ``headLevel``,
      * emits a ``.. _<path>#<anchor>:`` label built from the
        extension-less ``currentInput`` path and ``wikianchor`` of the
        stripped heading text,
      * emits the heading text underlined with *linechar*.
    """
    matching = [key[1] for key, char in linechars.items() if char == linechar]
    mylevel = int(matching[0])

    def parseAction(s, l, t):
        global headLevel
        gap = mylevel - headLevel
        prehead = ""
        if headLevel > 0 and gap > 1:  # new header skips a level
            # insert intermediate headers to assure consistency
            prehead = "".join(
                "\nXXX\n" + 3 * linechars["h" + str(headLevel + step)] + "\n\n"
                for step in range(1, gap))
        headLevel = mylevel

        title = t[0].strip()
        # create jump label
        path, ext = os.path.splitext(currentInput)
        label = "\n.. _" + path + "#" + wikianchor(title) + ":\n\n"
        # put it together
        return prehead + label + title + "\n" + linechar * len(title)

    return parseAction
def reindentBlock(s, n):
    """Return *s* with every ``\\n``-separated line prefixed by *n* spaces."""
    pad = n * ' '
    return "\n".join(pad + line for line in s.split('\n'))
def convertCodeBlock(s, l, t):
    """Parse action turning a ``<code>`` element into a reST literal block.

    Example tokens:
    t = [ 'code', ['javascript'], False, 'qx.Class.define(....)', '</code>']
    The element content is taken from ``t[3]`` when there are five or more
    tokens and from ``t[2]`` otherwise.  It is indented by four spaces and
    placed after a ``::`` line.
    """
    # TODO: the next leaves the :: on a line of its own, where it should be
    # appended to the previous paragraph!
    body = t[2] if len(t) < 5 else t[3]
    indented = "\n".join("    " + line for line in body.split('\n'))
    return "\n::\n\n" + indented + "\n"
def convertTable(s, l, t):
    """Parse action turning a Dokuwiki table into a reST simple table.

    Example tokens:
    t = ['header', '\\nfirst row\\nsecond row\\n...', '\\n\\n']
    ``t[0]`` is the ``^``-delimited header line; ``t[1]`` holds the
    ``|``-delimited rows, each preceded by a newline.

    Cell text is kept as is (including surrounding blanks) and padded to
    the width of the longest cell in its column.  Rows with fewer cells
    than the header are filled up with empty cells instead of raising
    ``IndexError``; cells beyond the header's column count are dropped.
    """
    def makeline(fields, colwids):
        # one table line: every field left-justified to its column width,
        # followed by a single separating blank
        cells = [field.ljust(colwid) + " "
                 for field, colwid in zip(fields, colwids)]
        return "".join(cells) + "\n"

    # split fields
    header = t[0].split('^')[1:-1]
    colcnt = len(header)
    rows = []
    for line in t[1].split('\n')[1:]:
        cells = line.split('|')[1:-1]
        # pad short (or cell-less) rows so every column can be indexed
        cells.extend([""] * (colcnt - len(cells)))
        rows.append(cells)

    # get longest field for each column
    maxcols = [max(len(row[colidx]) for row in [header] + rows)
               for colidx in range(colcnt)]

    # construct separator
    sep = " ".join(colwid * "=" for colwid in maxcols)

    # put it together
    parts = [sep + '\n', makeline(header, maxcols), sep + '\n']
    parts.extend(makeline(row, maxcols) for row in rows)
    parts.append(sep + '\n\n\n')
    return "".join(parts)
def convertNoteBlock(s, l, t):
    """Parse action turning a ``<note>`` element into a reST ``note`` directive.

    The element content is taken from ``t[3]`` when there are five or more
    tokens and from ``t[2]`` otherwise.  It is converted with
    ``wikiMarkup.transformString``, indented by four spaces and placed
    under ``.. note::``.
    """
    body = t[2] if len(t) < 5 else t[3]
    indented = reindentBlock(wikiMarkup.transformString(body), 4)
    # the unindented "xxx" line makes sure the note is closed
    return "\n.. note::\n\n" + indented + "\nxxx\n"
def eout(s):
    """Print *s* to standard error (Python 2 ``print >>`` statement)."""
    print >> sys.stderr, s
def convertHtml(i):
    """Return a parse action that converts embedded HTML to reST via pandoc.

    i -- index of the token in ``t`` that holds the HTML source.

    The returned action wraps ``t[i]`` in ``<html>...</html>``, pipes it
    UTF-8 encoded through ``pandoc --from=html --to=rst`` and returns
    pandoc's output decoded from UTF-8, with platform line separators
    turned into ``\\n`` and one trailing newline removed.

    Encoding and decoding are explicit so that non-ASCII page content no
    longer depends on an implicit ASCII conversion when writing to the
    pipe, and the result is text like the rest of the converted document.
    """
    def htmlBlock(s, l, t,):
        import subprocess
        eout(" converting HTML with pandoc")
        # preprocess
        markup = t[i]
        if isinstance(markup, bytes):
            markup = markup.decode("utf-8")
        markup = u"<html>" + markup + u"</html>"
        # NOTE(review): the published source shows a replace of a blank by a
        # blank here; presumably the first literal was a non-breaking space
        # (entity or character) -- both are mapped to a plain blank. Confirm.
        markup = markup.replace(u"&nbsp;", u" ").replace(u"\u00a0", u" ")
        p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        ret = p.communicate(markup.encode("utf-8"))[0].decode("utf-8")
        # postprocess
        ret = ret.replace(os.linesep, "\n")
        if ret.endswith("\n"):
            ret = ret[:-1]
        return ret
    return htmlBlock
# these are the sphinx recommended heading ornaments
# Maps a heading key ("h1" .. "h6") to the character used to underline
# headings of that level; convertHeading() reads the digit of the key as
# the numeric level.
linechars = {
    "h1" : "#",
    "h2" : "*",
    "h3" : "=",
    "h4" : "-",
    "h5" : "^",
    "h6" : '"'
}
# -- grammar elements -------------------------
# Each element pairs a Dokuwiki construct with the parse action that
# produces its reST replacement.

# //text// and __text__ both become *text*
italicized1= QuotedString("//").setParseAction(convertToRst("*","*"))
italicized2= QuotedString("__").setParseAction(convertToRst("*","*"))
# NOTE(review): boldItalic is assigned twice; the second assignment replaces
# the first, so only the **//text//** form ends up bound to this name.
boldItalic = QuotedString("***").setParseAction(convertToRst("**","**")) # have to choose one
boldItalic = QuotedString("**//",endQuoteChar="//**").setParseAction(convertToRst("**","**")) # have to choose one
# [[target|text]] links
urlRef = QuotedString("[[",endQuoteChar="]]").setParseAction(convertToRst_A)
#h1 - haven't really used part headings so far
# headings: a run of six '=' maps to the "h2" ornament, two '=' to "h6"
h2 = QuotedString("======").setParseAction(convertHeading(linechars["h2"]))
h3 = QuotedString("=====").setParseAction(convertHeading(linechars["h3"]))
h4 = QuotedString("====").setParseAction(convertHeading(linechars["h4"]))
h5 = QuotedString("===").setParseAction(convertHeading(linechars["h5"]))
h6 = QuotedString("==").setParseAction(convertHeading(linechars["h6"]))
# ''text'' becomes ``text``
code = QuotedString("''").setParseAction(convertToRst("``","``"))
# <html>...</html> handled as a quoted string; the HTML source is token 0
myHtmlBlock = convertHtml(0)
html = QuotedString("<html>", endQuoteChar="</html>").setParseAction(myHtmlBlock)
# {{target|text}} media references
imgRef = QuotedString("{{",endQuoteChar="}}").setParseAction(convertToRst_I)
# forced line break (two backslashes followed by a blank) is dropped
br = Literal(r'\\ ').setParseAction(replaceWith(""))
# Tag-based variant of the <html> element; here the HTML source is token 2.
# NOTE(review): myHtmlBlock is rebound below, and htmlEmb does not appear in
# the wikiMarkup alternation that follows these definitions.
htmlOpen,htmlClose = makeHTMLTags("html")
htmlEmb = htmlOpen + SkipTo(htmlClose) + htmlClose
#myHtmlBlock = functools.partial(htmlBlock, i=2)
myHtmlBlock = convertHtml(2)
htmlEmb.setParseAction(myHtmlBlock)
#htmlEmb.setParseAction(htmlBlock)
# <code>...</code> blocks
codeOpen, codeClose = makeHTMLTags("code")
codeBlock = codeOpen + SkipTo(codeClose, ) + codeClose
codeBlock.setParseAction(convertCodeBlock)
# <note>...</note> blocks
noteOpen, noteClose = makeHTMLTags("note")
noteBlock = noteOpen + SkipTo(noteClose, ) + noteClose
noteBlock.setParseAction(convertNoteBlock)
# tables: a line that starts and ends with '^' opens the table, the next
# empty line ("\n\n") closes it
tableOpen, tableClose = Regex(r'^\^.*\^$', re.M), Literal("\n\n").leaveWhitespace()
tableBlock = tableOpen + SkipTo(tableClose) + tableClose
tableBlock.setParseAction(convertTable)
# The complete converter: an alternation of all supported constructs.
# It is applied with transformString(), both to whole documents and, from
# inside several parse actions, to the text of a single match.
wikiMarkup = (
    italicized1 |
    italicized2 |
    boldItalic |
    urlRef | imgRef |
    h2 | h3 | h4 | h5 | h6 |
    code | codeBlock |
    noteBlock |
    html | br |
    tableBlock
    )
# -- file handling ---------------------------
# Raw strings keep the regex escape "\." from being read as an (invalid)
# string escape sequence.
# Matches a trailing ".txt" extension, ignoring case.
txtEnd = re.compile(r"\.txt$", re.I)
# Matches a common image extension anywhere in a string, ignoring case.
imgEnd = re.compile(r"\.(png|gif|jpg|jpeg)", re.I)
# Two or more consecutive line ends, replaced by exactly two newlines.
repeatedNewlines = (LineEnd() + OneOrMore(LineEnd())).setParseAction(
    replaceWith("\n\n"))


def rmEmptyLines(s):
    """Return *s* with every match of ``repeatedNewlines`` replaced by "\\n\\n"."""
    collapsed = repeatedNewlines.transformString(s)
    return collapsed
def transform(path):
    """Convert the Dokuwiki file at *path* and write the result next to it.

    Resets the global ``headLevel`` and points the global ``currentInput``
    at *path*, reads the file as UTF-8, converts it with ``wikiMarkup``
    and collapses blank lines with ``rmEmptyLines``.  A non-empty result
    is written as UTF-8 to *path* with its ``.txt`` suffix replaced by
    ``.rst``.  Both files are closed deterministically.
    """
    global headLevel
    global currentInput
    print("transforming: %s" % path)
    headLevel = 0  # reset heading level for new document
    currentInput = path
    with codecs.open(path, "rU", "utf-8") as infile:
        wikiInput = infile.read()
    rstString = wikiMarkup.transformString(wikiInput)
    # first pass leaves many blank lines, collapse these down
    rstString = rmEmptyLines(rstString)
    if rstString:
        with codecs.open(txtEnd.sub(".rst", path), "w", "utf-8") as outfile:
            outfile.write(rstString)
def main(path):
    """Convert every ``.txt`` file in the directory tree rooted at *path*.

    *path* must be an existing directory (checked with ``assert``).  It is
    stored in the global ``dirroot`` before ``transform`` is called for
    each file whose name ends with ``.txt``.
    """
    assert(os.path.isdir(path))
    global dirroot
    dirroot = path
    for root, dirs, files in os.walk(path):
        for name in files:
            if not name.endswith(".txt"):
                continue
            transform(os.path.join(root, name))
if __name__ == "__main__":
    # Three invocation forms:
    #   <root> <file>  convert a single file, result goes to STDOUT
    #   <root>         convert the whole directory tree, writing .rst files
    #   (no argument)  convert a small built-in sample, result to STDOUT
    wikiInput = ""
    if len(sys.argv) > 2:
        if os.path.isfile(sys.argv[2]):
            currentInput = sys.argv[2]
            dirroot = os.path.dirname(sys.argv[1]) or '.'
            # close the input deterministically instead of leaking the handle
            with codecs.open(sys.argv[2], "rU", "utf-8") as infile:
                wikiInput = infile.read()
    elif len(sys.argv) > 1:
        if os.path.isdir(sys.argv[1]):
            main(sys.argv[1])
    else:
        wikiInput = """
Here is a simple Wiki input:
//This is in italics.//
__This is also in italics.__
**This is in bold!**
***This is in bold italics!***
Here's a URL to [[http://qooxdoo.org|qooxdoo's homepage]]
"""
    if wikiInput:
        res = wikiMarkup.transformString(wikiInput)
        res = rmEmptyLines(res)
        # non-ASCII characters are emitted as XML character references
        print(res.encode('ascii', 'xmlcharrefreplace'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment