youwi · June 27, 2018 08:43
diff --git a/DocxToText.java b/DocxToText.java
 package com.qa.docx;

 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import java.io.*;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;

 /**
  * Extracts the plain text of a DOCX file by reading the
  * {@code word/document.xml} entry of the archive and concatenating the
  * contents of its {@code w:t} elements, one output line per {@code w:p}
  * paragraph.
  *
  * <p>Only text is extracted; formatting (colour, font size, background) and
  * images are ignored.
  */
 public class DOCXContentReader {
    /** Name of the zip entry that holds the main document body. */
    private static final String DOCUMENT = "word/document.xml";

    /** Size of the buffer used when copying the entry out of the archive. */
    private static final int BUFFER_SIZE = 8192;

    /** Extracted text, or {@code null} when the archive has no document entry. */
    private String contentText = null;

    /**
     * Reads the DOCX file at the given path.
     *
     * @param Filename path of the DOCX file
     * @throws IOException if the file cannot be opened or read
     */
    public DOCXContentReader(String Filename) throws IOException {
        this(new File(Filename));
    }

    /**
     * Reads the given DOCX file.
     *
     * @param File the DOCX file
     * @throws IOException if the file cannot be opened or read
     */
    public DOCXContentReader(File File) throws IOException {
        this(new FileInputStream(File));
    }

    /**
     * Reads a DOCX archive from the given stream. The stream is always closed
     * before this constructor returns, including when reading fails.
     *
     * @param Stream stream positioned at the start of the DOCX (zip) data
     * @throws IOException if the archive cannot be read
     */
    public DOCXContentReader(InputStream Stream) throws IOException {
        ZipInputStream oZip = new ZipInputStream(Stream);
        try {
            ZipEntry oEntry;
            while ((oEntry = oZip.getNextEntry()) != null) {
                if (oEntry.isDirectory() || !oEntry.getName().equalsIgnoreCase(DOCUMENT)) {
                    continue;
                }
                contentText = parseAsText(readCurrentEntry(oZip));
                // The document body has been found; the remaining entries are irrelevant.
                break;
            }
        } finally {
            // Closing the zip stream also closes the underlying stream, so the
            // FileInputStream opened by the File constructor is not leaked on failure.
            oZip.close();
        }
    }

    /**
     * Copies the remaining bytes of the current zip entry into memory.
     *
     * @param in stream positioned inside the entry
     * @return the entry's bytes
     * @throws IOException if reading fails
     */
    private static byte[] readCurrentEntry(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream(BUFFER_SIZE);
        byte[] chunk = new byte[BUFFER_SIZE];
        int count;
        while ((count = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, count);
        }
        return buffer.toByteArray();
    }

    /**
     * Parses the document XML and returns its text. The text of every
     * {@code w:t} element inside a {@code w:p} element is appended, and each
     * paragraph is terminated with {@code "\n"}.
     *
     * <p>Parsing is best-effort: if the XML cannot be parsed, the failure is
     * reported on {@code System.err} and whatever text was collected so far
     * (possibly the empty string) is returned.
     *
     * @param Contents raw bytes of {@code word/document.xml}
     * @return the extracted text, never {@code null}
     */
    private String parseAsText(byte[] Contents) {
        StringBuilder out = new StringBuilder();

        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // DOCX bodies never need a DOCTYPE; rejecting it blocks XXE and
            // entity-expansion attacks from untrusted files.
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder dBuilder = factory.newDocumentBuilder();
            Document dom = dBuilder.parse(new ByteArrayInputStream(Contents));

            NodeList plist = dom.getElementsByTagName("w:p");
            for (int current = 0; current < plist.getLength(); current++) {
                Element sp = (Element) plist.item(current);
                NodeList tlist = sp.getElementsByTagName("w:t");
                for (int i = 0; i < tlist.getLength(); i++) {
                    // getTextContent() yields "" for an empty <w:t/> and joins all
                    // child text nodes; getFirstChild() would be null for an empty
                    // element and would miss any text node after the first.
                    out.append(tlist.item(i).getTextContent());
                }
                out.append("\n");
            }
        } catch (Exception ee) {
            // Print the exception itself: getMessage() may be null.
            System.err.println("DOCXContentReader: failed to parse " + DOCUMENT + ": " + ee);
        }
        return out.toString();
    }

    /**
     * Returns the text extracted by the constructor.
     *
     * @return the document text, or {@code null} if the archive contained no
     *         {@code word/document.xml} entry
     */
    public String getContentText() {
        return contentText;
    }
 }
	package com.qa.docx;

	import org.w3c.dom.Document;
	import org.w3c.dom.Element;
	import org.w3c.dom.NodeList;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import java.io.*;
	import java.util.zip.ZipEntry;
	import java.util.zip.ZipInputStream;

	/**
	 * Extracts the plain text of a DOCX file by reading the
	 * {@code word/document.xml} entry of the archive and concatenating the
	 * contents of its {@code w:t} elements, one output line per {@code w:p}
	 * paragraph.
	 *
	 * <p>Only text is extracted; formatting (colour, font size, background) and
	 * images are ignored.
	 */
	public class DOCXContentReader {
	    /** Name of the zip entry that holds the main document body. */
	    private static final String DOCUMENT = "word/document.xml";

	    /** Size of the buffer used when copying the entry out of the archive. */
	    private static final int BUFFER_SIZE = 8192;

	    /** Extracted text, or {@code null} when the archive has no document entry. */
	    private String contentText = null;

	    /**
	     * Reads the DOCX file at the given path.
	     *
	     * @param Filename path of the DOCX file
	     * @throws IOException if the file cannot be opened or read
	     */
	    public DOCXContentReader(String Filename) throws IOException {
	        this(new File(Filename));
	    }

	    /**
	     * Reads the given DOCX file.
	     *
	     * @param File the DOCX file
	     * @throws IOException if the file cannot be opened or read
	     */
	    public DOCXContentReader(File File) throws IOException {
	        this(new FileInputStream(File));
	    }

	    /**
	     * Reads a DOCX archive from the given stream. The stream is always closed
	     * before this constructor returns, including when reading fails.
	     *
	     * @param Stream stream positioned at the start of the DOCX (zip) data
	     * @throws IOException if the archive cannot be read
	     */
	    public DOCXContentReader(InputStream Stream) throws IOException {
	        ZipInputStream oZip = new ZipInputStream(Stream);
	        try {
	            ZipEntry oEntry;
	            while ((oEntry = oZip.getNextEntry()) != null) {
	                if (oEntry.isDirectory() || !oEntry.getName().equalsIgnoreCase(DOCUMENT)) {
	                    continue;
	                }
	                contentText = parseAsText(readCurrentEntry(oZip));
	                // The document body has been found; the remaining entries are irrelevant.
	                break;
	            }
	        } finally {
	            // Closing the zip stream also closes the underlying stream, so the
	            // FileInputStream opened by the File constructor is not leaked on failure.
	            oZip.close();
	        }
	    }

	    /**
	     * Copies the remaining bytes of the current zip entry into memory.
	     *
	     * @param in stream positioned inside the entry
	     * @return the entry's bytes
	     * @throws IOException if reading fails
	     */
	    private static byte[] readCurrentEntry(InputStream in) throws IOException {
	        ByteArrayOutputStream buffer = new ByteArrayOutputStream(BUFFER_SIZE);
	        byte[] chunk = new byte[BUFFER_SIZE];
	        int count;
	        while ((count = in.read(chunk)) != -1) {
	            buffer.write(chunk, 0, count);
	        }
	        return buffer.toByteArray();
	    }

	    /**
	     * Parses the document XML and returns its text. The text of every
	     * {@code w:t} element inside a {@code w:p} element is appended, and each
	     * paragraph is terminated with {@code "\n"}.
	     *
	     * <p>Parsing is best-effort: if the XML cannot be parsed, the failure is
	     * reported on {@code System.err} and whatever text was collected so far
	     * (possibly the empty string) is returned.
	     *
	     * @param Contents raw bytes of {@code word/document.xml}
	     * @return the extracted text, never {@code null}
	     */
	    private String parseAsText(byte[] Contents) {
	        StringBuilder out = new StringBuilder();

	        try {
	            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
	            // DOCX bodies never need a DOCTYPE; rejecting it blocks XXE and
	            // entity-expansion attacks from untrusted files.
	            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
	            DocumentBuilder dBuilder = factory.newDocumentBuilder();
	            Document dom = dBuilder.parse(new ByteArrayInputStream(Contents));

	            NodeList plist = dom.getElementsByTagName("w:p");
	            for (int current = 0; current < plist.getLength(); current++) {
	                Element sp = (Element) plist.item(current);
	                NodeList tlist = sp.getElementsByTagName("w:t");
	                for (int i = 0; i < tlist.getLength(); i++) {
	                    // getTextContent() yields "" for an empty <w:t/> and joins all
	                    // child text nodes; getFirstChild() would be null for an empty
	                    // element and would miss any text node after the first.
	                    out.append(tlist.item(i).getTextContent());
	                }
	                out.append("\n");
	            }
	        } catch (Exception ee) {
	            // Print the exception itself: getMessage() may be null.
	            System.err.println("DOCXContentReader: failed to parse " + DOCUMENT + ": " + ee);
	        }
	        return out.toString();
	    }

	    /**
	     * Returns the text extracted by the constructor.
	     *
	     * @return the document text, or {@code null} if the archive contained no
	     *         {@code word/document.xml} entry
	     */
	    public String getContentText() {
	        return contentText;
	    }
	}
No results found