Created
July 17, 2013 12:57
-
-
Save melix/6020336 to your computer and use it in GitHub Desktop.
Convert Confluence HTML export into asciidoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts a Confluence HTML export into AsciiDoc.
//
// Reads every '.html' file under the 'html' directory, cleans it with HtmlCleaner,
// writes the cleaned markup to a temporary file and runs pandoc on it, producing
// '<relative path>.adoc' under the 'asciidoc' directory (mirroring the source tree).
// Requires the 'pandoc' executable to be available on the PATH.
@Grab('net.sourceforge.htmlcleaner:htmlcleaner:2.4')
import org.htmlcleaner.*

def src = new File('html').toPath()
def dst = new File('asciidoc').toPath()

def cleaner = new HtmlCleaner()
def props = cleaner.properties
props.translateSpecialEntities = false
def serializer = new SimpleHtmlSerializer(props)

// The output root must exist before any sub-directory or .adoc file can be created in it.
dst.toFile().mkdirs()

src.toFile().eachFileRecurse { f ->
    def relative = src.relativize(f.toPath())
    def target = dst.resolve(relative)
    if (f.isDirectory()) {
        // Mirror the source directory layout; mkdirs() also tolerates missing parents.
        target.toFile().mkdirs()
    } else if (f.name.endsWith('.html')) {
        def tmpHtml = File.createTempFile('clean', '.html')
        try {
            println "Converting $relative"
            def result = cleaner.clean(f)
            // NOTE(review): HtmlCleaner declares TagNodeVisitor.visit(parentNode, htmlNode), so
            // 'tagNode' here appears to be the PARENT of the node being visited - confirm.
            result.traverse({ tagNode, htmlNode ->
                // NOTE(review): a reader of this script reported that dropping 'class'
                // breaks the conversion of code blocks; reconsider if that matters to you.
                tagNode?.attributes?.remove 'class'
                if ('td' == tagNode?.name || 'th' == tagNode?.name) {
                    // Flatten table cells to plain text and turn header cells into regular cells.
                    tagNode.name = 'td'
                    String txt = tagNode.text
                    tagNode.removeAllChildren()
                    tagNode.insertChild(0, new ContentNode(txt))
                }
                true // keep traversing
            } as TagNodeVisitor)
            serializer.writeToFile(result, tmpHtml.absolutePath, "utf-8")

            // Pass the arguments as a list so that paths containing whitespace are not split.
            // NOTE(review): '-R', '-S' and '--normalize' are believed to have been removed in
            // pandoc 2.x - confirm against the installed pandoc version.
            def pandoc = ['pandoc', '-f', 'html', '-t', 'asciidoc', '-R', '-S', '--normalize', '-s',
                          tmpHtml.absolutePath, '-o', "${target}.adoc".toString()].execute()
            // Drain stdout/stderr while waiting; a bare waitFor() can block once the pipe fills up.
            pandoc.waitForProcessOutput(System.out, System.err)
            if (pandoc.exitValue() != 0) {
                System.err.println "pandoc failed (exit code ${pandoc.exitValue()}) for $relative"
            }
        } finally {
            // Always remove the temporary file, even if cleaning or conversion failed.
            tmpHtml.delete()
        }
    }
    // Files that are not '.html' (e.g. attachments) are intentionally not copied.
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Is line 22, `tagNode?.attributes?.remove 'class'`, a good idea? For me it breaks the conversion of code blocks.