-
-
Save boostrack/21ad559b4f5e58c7e088e6741c87169b to your computer and use it in GitHub Desktop.
Convert a Confluence HTML export into AsciiDoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Converts every *.html file under ./html into an AsciiDoc file under
 * ./asciidoc, mirroring the directory layout of the input tree.
 *
 * For each HTML file the script:
 *   1. parses it with HtmlCleaner,
 *   2. strips all `class` attributes and flattens table cells to plain text
 *      (th cells are renamed to td),
 *   3. serializes the cleaned tree to a temporary file,
 *   4. runs `pandoc` on the temporary file, writing `<relative path>.adoc`
 *      (so `page.html` becomes `page.html.adoc`).
 *
 * Requires a `pandoc` executable on the PATH.
 * NOTE(review): the -R, -S and --normalize flags are pandoc 1.x options and
 * were removed in pandoc 2.0 — confirm the installed pandoc version.
 */
@Grab('net.sourceforge.htmlcleaner:htmlcleaner:2.4')
import org.htmlcleaner.*

def src = new File('html').toPath()
def dst = new File('asciidoc').toPath()

// The walk below only visits entries *inside* src, so the output root itself
// has to be created explicitly or pandoc has nowhere to write top-level pages.
dst.toFile().mkdirs()

def cleaner = new HtmlCleaner()
def props = cleaner.properties
props.translateSpecialEntities = false
def serializer = new SimpleHtmlSerializer(props)

// Visitor applied to every node of a parsed document.
// The first argument is the PARENT of the node being visited; the node itself
// is the second argument, so all edits are made on htmlNode. Acting on the
// parent would skip childless elements (e.g. an empty <th>).
def simplifyMarkup = { TagNode parentNode, HtmlNode htmlNode ->
    if (htmlNode instanceof TagNode) {
        TagNode node = (TagNode) htmlNode
        // Use the TagNode API rather than mutating the map returned by getAttributes().
        node.removeAttribute('class')
        if (node.name in ['td', 'th']) {
            // Header cells become ordinary cells, and any nested markup inside
            // a cell is collapsed to its text content.
            node.name = 'td'
            String txt = node.text
            node.removeAllChildren()
            node.insertChild(0, new ContentNode(txt))
        }
    }
    true // keep traversing
} as TagNodeVisitor

src.toFile().eachFileRecurse { f ->
    def relative = src.relativize(f.toPath())
    def target = dst.resolve(relative)
    if (f.isDirectory()) {
        // mkdirs (not mkdir) so a missing parent never makes this fail silently.
        target.toFile().mkdirs()
    } else if (f.name.endsWith('.html')) {
        def tmpHtml = File.createTempFile('clean', '.html')
        try {
            println "Converting $relative"
            def result = cleaner.clean(f)
            result.traverse(simplifyMarkup)
            serializer.writeToFile(
                result, tmpHtml.absolutePath, "utf-8"
            )
            // Pass the command as a list: String.execute() splits on whitespace
            // and would break on any path that contains a space.
            def pandoc = [
                'pandoc', '-f', 'html', '-t', 'asciidoc', '-R', '-S', '--normalize', '-s',
                tmpHtml.absolutePath, '-o', "${target}.adoc".toString()
            ].execute()
            // Drain stdout/stderr while waiting so a chatty pandoc cannot block
            // on a full pipe, and so its diagnostics are visible.
            pandoc.waitForProcessOutput(System.out, System.err)
            if (pandoc.exitValue() != 0) {
                // Best-effort batch run: report the failure and carry on with the next file.
                System.err.println "pandoc exited with code ${pandoc.exitValue()} for $relative"
            }
        } finally {
            // Always remove the temporary file, even when cleaning or pandoc fails.
            tmpHtml.delete()
        }
    }/* else {
        "cp html/$relative $target".execute()
    }*/
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment