Skip to content

Instantly share code, notes, and snippets.

@youwi
Last active June 27, 2018 08:43
Show Gist options
  • Select an option

  • Save youwi/faa83492e16dbc8187e32c21c8ed4817 to your computer and use it in GitHub Desktop.

Select an option

Save youwi/faa83492e16dbc8187e32c21c8ed4817 to your computer and use it in GitHub Desktop.
DOCX to text in pure Java, without dependencies. Extracts text from .docx files.
package com.qa.docx;
import java.io.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * Extracts the plain text of a DOCX file using only JDK classes.
 *
 * <p>A DOCX file is a zip archive; this reader locates the
 * {@code word/document.xml} entry, parses it as XML and concatenates the
 * content of every {@code w:t} element, emitting one line per {@code w:p}
 * paragraph. Formatting (colour, font size, background) and images are
 * ignored.
 *
 * <p>All work happens in the constructor; afterwards the result is available
 * from {@link #getContentText()}.
 */
public class DOCXContentReader {
    /** Name of the zip entry holding the main document body. */
    private static final String DOCUMENT = "word/document.xml";

    /** Extracted text; remains {@code null} when the archive has no document entry. */
    private String contentText = null;

    /**
     * Reads the DOCX file at the given path.
     *
     * @param Filename path of the DOCX file
     * @throws IOException if the file cannot be opened or read
     */
    public DOCXContentReader(String Filename) throws IOException {
        this(new File(Filename));
    }

    /**
     * Reads the given DOCX file.
     *
     * @param File the DOCX file
     * @throws IOException if the file cannot be opened or read
     */
    public DOCXContentReader(File File) throws IOException {
        this(new FileInputStream(File));
    }

    /**
     * Reads a DOCX archive from a stream. The stream is always closed before
     * this constructor returns, whether it succeeds or throws.
     *
     * @param Stream stream positioned at the start of the zip archive
     * @throws IOException if reading the archive fails
     */
    public DOCXContentReader(InputStream Stream) throws IOException {
        // try-with-resources releases the zip stream (and the stream it wraps)
        // even when reading an entry throws.
        try (ZipInputStream oZip = new ZipInputStream(Stream)) {
            ZipEntry oEntry;
            while ((oEntry = oZip.getNextEntry()) != null) {
                if (oEntry.isDirectory() || !DOCUMENT.equalsIgnoreCase(oEntry.getName())) {
                    continue;
                }
                contentText = parseAsText(readCurrentEntry(oZip));
                // The document part has been found; no need to decompress
                // the remaining entries (images, styles, ...).
                break;
            }
        }
    }

    /**
     * Reads the remaining bytes of the current zip entry into memory.
     *
     * @param in stream positioned inside the entry
     * @return the entry's decompressed bytes
     * @throws IOException if reading fails
     */
    private static byte[] readCurrentEntry(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream(8192);
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, count);
        }
        return buffer.toByteArray();
    }

    /**
     * Extracts the text of the document XML: the content of each {@code w:t}
     * element, with a {@code "\n"} appended after every {@code w:p} paragraph.
     *
     * <p>Parsing is best effort: when the XML cannot be parsed the problem is
     * reported on {@code System.err} and whatever text was collected so far
     * (possibly the empty string) is returned.
     *
     * @param Contents raw bytes of {@code word/document.xml}
     * @return the extracted text, never {@code null}
     */
    private String parseAsText(byte[] Contents) {
        StringBuilder out = new StringBuilder();
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // The document content is untrusted: refuse DOCTYPE declarations so
            // that external entities (XXE) and entity expansion are impossible.
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder dBuilder = factory.newDocumentBuilder();
            Document dom = dBuilder.parse(new ByteArrayInputStream(Contents));
            NodeList plist = dom.getElementsByTagName("w:p");
            for (int current = 0; current < plist.getLength(); current++) {
                Element sp = (Element) plist.item(current);
                NodeList tlist = sp.getElementsByTagName("w:t");
                for (int i = 0; i < tlist.getLength(); i++) {
                    // getTextContent() yields "" for an empty <w:t/>, where
                    // getFirstChild() would be null.
                    out.append(tlist.item(i).getTextContent());
                }
                out.append("\n");
            }
        } catch (ParserConfigurationException | SAXException | IOException ee) {
            System.err.println("DOCXContentReader: failed to parse " + DOCUMENT + ": " + ee);
        }
        return out.toString();
    }

    /**
     * Returns the text extracted by the constructor.
     *
     * @return the document text, or {@code null} if the archive contained no
     *         {@code word/document.xml} entry
     */
    public String getContentText() {
        return contentText;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment