Last active
June 27, 2018 08:43
-
-
Save youwi/faa83492e16dbc8187e32c21c8ed4817 to your computer and use it in GitHub Desktop.
DOCX to text in pure Java, without dependencies. Extracts text from .docx files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package com.qa.docx; | |
| import org.w3c.dom.Document; | |
| import org.w3c.dom.Element; | |
| import org.w3c.dom.NodeList; | |
| import javax.xml.parsers.DocumentBuilder; | |
| import javax.xml.parsers.DocumentBuilderFactory; | |
| import java.io.*; | |
| import java.util.zip.ZipEntry; | |
| import java.util.zip.ZipInputStream; | |
/**
 * Extracts the plain text of a DOCX file using only the JDK.
 *
 * <p>A DOCX file is a zip archive; this class locates the {@code word/document.xml}
 * entry, parses it as XML and concatenates the text of every {@code w:t} element,
 * emitting one line per {@code w:p} paragraph. Formatting (color, font size,
 * background) and images are ignored.
 *
 * <p>Elements are matched by their literal prefixed names ({@code w:p}, {@code w:t}),
 * so documents that bind the WordprocessingML namespace to another prefix yield no text.
 * Both lookups match descendants at any depth, so text inside a {@code w:p} nested in
 * another {@code w:p} is emitted once for each of the two paragraphs.
 */
public class DOCXContentReader {

    /** Name, inside the zip container, of the entry holding the main document. */
    private static final String DOCUMENT = "word/document.xml";

    /** Size of the chunk used when copying the zip entry into memory. */
    private static final int BUFFER_SIZE = 8192;

    /** Extracted text, or {@code null} when the archive has no main document entry. */
    private final String contentText;

    /**
     * Reads the DOCX file at the given path.
     *
     * @param Filename path of the DOCX file
     * @throws IOException if the file cannot be read or its document XML cannot be parsed
     */
    public DOCXContentReader(String Filename) throws IOException {
        this(new File(Filename));
    }

    /**
     * Reads the given DOCX file.
     *
     * @param File the DOCX file
     * @throws IOException if the file cannot be read or its document XML cannot be parsed
     */
    public DOCXContentReader(File File) throws IOException {
        this(new FileInputStream(File));
    }

    /**
     * Reads a DOCX archive from a stream. The stream is always closed before this
     * constructor returns, whether it succeeds or throws.
     *
     * @param Stream stream positioned at the start of the zip data
     * @throws IOException if the stream cannot be read or the document XML cannot be parsed
     */
    public DOCXContentReader(InputStream Stream) throws IOException {
        String text = null;
        // try-with-resources closes the zip (and the wrapped stream) on failure too.
        try (ZipInputStream zip = new ZipInputStream(Stream)) {
            ZipEntry entry;
            while ((entry = zip.getNextEntry()) != null) {
                if (entry.isDirectory() || !DOCUMENT.equalsIgnoreCase(entry.getName())) {
                    continue;
                }
                text = parseAsText(readFully(zip));
                // The document has been found; no need to decompress the remaining entries.
                break;
            }
        }
        this.contentText = text;
    }

    /**
     * Copies the remaining bytes of the current zip entry into memory.
     *
     * @param in stream positioned inside the entry
     * @return the entry's bytes
     * @throws IOException if reading fails
     */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream(BUFFER_SIZE);
        byte[] chunk = new byte[BUFFER_SIZE];
        int read;
        while ((read = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    }

    /**
     * Collects the text of all {@code w:t} elements, one output line per {@code w:p}.
     * Only text is supported: color, font size, background and images are ignored.
     *
     * @param Contents raw bytes of {@code word/document.xml}
     * @return the text, with a {@code "\n"} appended after every paragraph
     * @throws IOException if the XML parser cannot be configured or the bytes are not
     *         well-formed XML
     */
    private static String parseAsText(byte[] Contents) throws IOException {
        Document dom;
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // DOCX content is untrusted input: refuse DOCTYPE declarations so that
            // external entities (XXE) can never be resolved.
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            DocumentBuilder builder = factory.newDocumentBuilder();
            dom = builder.parse(new ByteArrayInputStream(Contents));
        } catch (javax.xml.parsers.ParserConfigurationException | org.xml.sax.SAXException e) {
            throw new IOException("Unable to parse " + DOCUMENT, e);
        }

        StringBuilder out = new StringBuilder();
        NodeList paragraphs = dom.getElementsByTagName("w:p");
        for (int p = 0; p < paragraphs.getLength(); p++) {
            Element paragraph = (Element) paragraphs.item(p);
            NodeList texts = paragraph.getElementsByTagName("w:t");
            for (int t = 0; t < texts.getLength(); t++) {
                // getTextContent() yields "" for an empty <w:t/>, which has no child node.
                out.append(texts.item(t).getTextContent());
            }
            out.append("\n");
        }
        return out.toString();
    }

    /**
     * Returns the extracted text.
     *
     * @return the document text, or {@code null} if the archive contained no
     *         {@code word/document.xml} entry
     */
    public String getContentText() {
        return contentText;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment