-
-
Save bdabelow/67db92c7bd33687353fd8a07ede9ff5c to your computer and use it in GitHub Desktop.
Convert Confluence HTML export into asciidoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Converts a Confluence HTML export into AsciiDoc.
 *
 * Every *.html file below the 'html' directory is cleaned with HtmlCleaner
 * (class attributes dropped, table cells flattened to plain text), written to
 * a temporary file and handed to pandoc, which writes '<name>.html.adoc' into
 * the mirrored location below the 'asciidoc' directory.
 *
 * Requires the 'pandoc' executable on the PATH. The script exits with
 * pandoc's exit code as soon as one conversion fails.
 */
@Grab('net.sourceforge.htmlcleaner:htmlcleaner:2.4')
import org.htmlcleaner.*

def src = new File('html').toPath()
def dst = new File('asciidoc').toPath()

def cleaner = new HtmlCleaner()
def props = cleaner.properties
// Keep entities such as &nbsp; as written instead of translating them.
props.translateSpecialEntities = false
def serializer = new SimpleHtmlSerializer(props)

// eachFileRecurse never passes the root directory itself to the closure,
// so the output root has to be created explicitly.
dst.toFile().mkdirs()

src.toFile().eachFileRecurse { f ->
    def relative = src.relativize(f.toPath())
    def target = dst.resolve(relative)
    if (f.isDirectory()) {
        // mkdirs() so that a missing intermediate directory is not fatal.
        target.toFile().mkdirs()
    } else if (f.name.endsWith('.html')) {
        println "Converting $relative"
        target.toFile().parentFile.mkdirs()

        def adocPath = "${target}.adoc".toString()
        def tmpHtml = File.createTempFile('clean', '.html')
        // Argument list instead of one command string: a single string is
        // split on whitespace and breaks for paths that contain spaces.
        def cmd = ['pandoc', '-f', 'html+raw_html+smart', '-t', 'asciidoc',
                   '-s', tmpHtml.absolutePath, '-o', adocPath]
        def stdout = new StringBuilder()
        def stderr = new StringBuilder()
        int exitCode
        try {
            def result = cleaner.clean(f)
            // The visitor receives (parent, visited node); the clean-up has to
            // be applied to the visited node, otherwise childless elements
            // are never processed.
            result.traverse({ parentNode, htmlNode ->
                if (htmlNode instanceof TagNode) {
                    def tag = (TagNode) htmlNode
                    tag.attributes?.remove 'class'
                    if ('td' == tag.name || 'th' == tag.name) {
                        // Header cells become ordinary cells and all nested
                        // markup is reduced to its text content.
                        tag.name = 'td'
                        String txt = tag.text
                        tag.removeAllChildren()
                        tag.insertChild(0, new ContentNode(txt))
                    }
                }
                true
            } as TagNodeVisitor)
            serializer.writeToFile(result, tmpHtml.absolutePath, "utf-8")

            println "Target: ${adocPath}"
            def proc = cmd.execute()
            // Drain both streams while waiting so pandoc cannot block on a
            // full pipe buffer.
            proc.waitForProcessOutput(stdout, stderr)
            exitCode = proc.exitValue()
        } finally {
            // Remove the temp file even when cleaning or pandoc throws.
            tmpHtml.delete()
        }

        if (exitCode != 0) {
            println "\nCommand returned error: ${cmd.join(' ')}\n"
            println stderr
            System.exit(exitCode)
        }
    }
    // Files that are not *.html (attachments, images, ...) are not copied.
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment