Last active
October 24, 2018 14:49
-
-
Save lovasoa/0aae8d38b3399e6387d4d321708cbc5e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package com.qwant; | |
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
| public class HtmlContentExtractor extends DefaultHandler { | |
| final static SAXParserImpl parser; | |
| static { | |
| try { | |
| parser = SAXParserImpl.newInstance(null); | |
| } catch (SAXException | IOException e) { | |
| throw new Error(e); | |
| } | |
| } | |
| StringBuilder contents = new StringBuilder(); | |
| public static CharSequence parseText(String s) { | |
| try { | |
| InputStream stream = new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); | |
| HtmlContentExtractor myHandler = new HtmlContentExtractor(); | |
| synchronized (parser) { | |
| parser.parse(stream, myHandler); | |
| } | |
| return myHandler.getContents(); | |
| } catch (SAXException | IOException e) { | |
| return s; | |
| } | |
| } | |
| @Override | |
| public void startElement(String uri, String localName, String qName, Attributes attributes) { | |
| if (localName.equals("br")) contents.append(' '); | |
| } | |
| @Override | |
| public void ignorableWhitespace(char[] ch, int start, int length) { | |
| contents.append(ch[start]); | |
| } | |
| @Override | |
| public void characters(char[] ch, int start, int length) { | |
| contents.append(ch, start, length); | |
| } | |
| CharSequence getContents() { | |
| return contents; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment