Skip to content

Instantly share code, notes, and snippets.

@lovasoa
Last active October 24, 2018 14:49
Show Gist options
  • Select an option

  • Save lovasoa/0aae8d38b3399e6387d4d321708cbc5e to your computer and use it in GitHub Desktop.

Select an option

Save lovasoa/0aae8d38b3399e6387d4d321708cbc5e to your computer and use it in GitHub Desktop.
package com.qwant;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that collects the text content of an HTML document into a single buffer.
 *
 * <p>Use the static entry point {@link #parseText(String)}. Parsing is delegated to one shared
 * TagSoup {@code SAXParserImpl}; calls are serialized on that parser, while every call gets its
 * own handler instance so the accumulated text is never shared between calls.
 */
public class HtmlContentExtractor extends DefaultHandler {

    /** Shared parser instance; every use is guarded by {@code synchronized (parser)}. */
    static final SAXParserImpl parser;

    static {
        try {
            parser = SAXParserImpl.newInstance(null);
        } catch (SAXException | IOException e) {
            // Still an Error (as before), but the dedicated type for failed static initialization.
            throw new ExceptionInInitializerError(e);
        }
    }

    /** Text accumulated so far by the SAX callbacks of this handler. */
    final StringBuilder contents = new StringBuilder();

    /**
     * Extracts the text content of the given HTML.
     *
     * <p>The string is fed to the parser as a character stream, so no byte encoding or charset
     * detection is involved and non-ASCII text is preserved regardless of the platform charset.
     *
     * @param s the HTML markup to parse; must not be {@code null}
     * @return the extracted text, or {@code s} itself, unmodified, if parsing fails
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static CharSequence parseText(String s) {
        HtmlContentExtractor handler = new HtmlContentExtractor();
        try {
            InputSource source = new InputSource(new StringReader(s));
            synchronized (parser) {
                parser.parse(source, handler);
            }
            return handler.getContents();
        } catch (SAXException | IOException ignored) {
            // Deliberate best-effort: when the markup cannot be parsed, hand back the raw input.
            return s;
        }
    }

    /**
     * Inserts a space for every {@code <br>} element so that the text on either side of the
     * line break does not run together. All other elements contribute nothing by themselves.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        if ("br".equals(localName)) {
            contents.append(' ');
        }
    }

    /**
     * Collapses a run of ignorable whitespace to its first character.
     *
     * <p>An empty run ({@code length == 0}) is skipped, because {@code ch[start]} would then be
     * outside the reported range (stale data or an out-of-bounds index).
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) {
        if (length > 0) {
            contents.append(ch[start]);
        }
    }

    /** Appends the reported character data verbatim. */
    @Override
    public void characters(char[] ch, int start, int length) {
        contents.append(ch, start, length);
    }

    /**
     * Returns the text collected so far.
     *
     * @return the live internal buffer (not a copy)
     */
    CharSequence getContents() {
        return contents;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment