Created
January 26, 2015 16:35
-
-
Save bdkosher/9c9086ef66dea3a4cc9e to your computer and use it in GitHub Desktop.
RTF to HTML with Apache Tika
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@Grab(group='org.apache.tika', module='tika-core', version='1.7') | |
@Grab(group='org.apache.tika', module='tika-parsers', version='1.7') | |
import org.apache.tika.metadata.* | |
import org.apache.tika.parser.* | |
import org.apache.tika.parsers.* | |
import org.apache.tika.sax.* | |
import javax.xml.transform.* | |
import javax.xml.transform.sax.* | |
import javax.xml.transform.stream.* | |
import org.xml.sax.* | |
// Input directory scanned for RTF files, and output directory that receives the
// generated markup (one output file per input file).
def rtfDir = new File(/C:\dev\data\rtf/)
def htmlDir = new File(/C:\dev\data\html/)
// Create the output directory up front: writing a file into a directory that does
// not exist yet fails with a FileNotFoundException. mkdirs() is a no-op when the
// directory is already there.
htmlDir.mkdirs()
// Charset that is both declared in the generated XML prolog and used to encode the
// file on disk. Keeping a single constant guarantees the two never disagree.
def outputEncoding = 'windows-1252'

// Builds a fresh Metadata for each document. The parser is handed this object and
// may write document properties into it, so sharing one instance across files would
// let values from an earlier document carry over into later ones.
def newRtfMetadata = {
    Metadata metadata = new Metadata() {
        // workaround for https://issues.apache.org/jira/browse/TIKA-1168
        // add() calls for SIMPLE properties are dropped; every other property type
        // is forwarded to the stock implementation.
        @Override
        public void add(Property prop, String value) {
            if (prop.propertyType != Property.PropertyType.SIMPLE) {
                super.add(prop, value)
            }
        }
    }
    metadata.set(Metadata.CONTENT_TYPE, 'application/rtf')
    return metadata
}

// The factory lookup does not depend on the file being converted, so do it once.
// A new TransformerHandler is still created per document below.
def transformerFactory = SAXTransformerFactory.newInstance()

// Convert every *.rtf file in rtfDir to "<name>.rtf.html" inside htmlDir.
rtfDir.eachFileMatch(~/.*\.rtf/) { rtf ->
    def parser = new org.apache.tika.parser.rtf.RTFParser()

    // Serialize the SAX events emitted by the parser into an in-memory string.
    StringWriter sw = new StringWriter()
    def handler = transformerFactory.newTransformerHandler()
    handler.transformer.setOutputProperty(OutputKeys.METHOD, 'xml')
    handler.transformer.setOutputProperty(OutputKeys.INDENT, 'yes')
    handler.transformer.setOutputProperty(OutputKeys.ENCODING, outputEncoding)
    handler.setResult(new StreamResult(sw))

    boolean parsed = false
    rtf.withInputStream { input ->
        try {
            parser.parse(input, handler, newRtfMetadata(), new ParseContext())
            parsed = true
        } catch (e) {
            println "Unable to parse $rtf : $e.message"
            e.printStackTrace()
        }
    }
    if (!parsed) {
        // Do not leave a truncated or empty output file behind for a document that
        // failed to parse; move on to the next input file.
        return
    }

    def html = new File(htmlDir, "${rtf.name}.html")
    println "Writing output to $html"
    // write() replaces any existing content (the previous '<<' appended, so a re-run
    // duplicated the document) and encodes with the charset named in the XML prolog
    // instead of the platform default.
    html.write(sw.toString(), outputEncoding)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice! I got it to work in Java too 👍, but I have a problem with the RTF header.
There I have a charset with the font "Courier New", but it does not appear in the HTML:
{\rtf1\ansi\ansicpg1252\deff0\deflang1031{\fonttbl{\f0\fswiss\fprq2\fcharset0 Courier New;}{\f1\fmodern\fprq1\fcharset0 Courier New;}}
{\colortbl ;\red0\green0\blue0;}