Skip to content

Instantly share code, notes, and snippets.

@derlin
Last active December 22, 2019 07:56
Show Gist options
  • Save derlin/785cfdbe442c7dd0e55368a7048a7001 to your computer and use it in GitHub Desktop.
Save derlin/785cfdbe442c7dd0e55368a7048a7001 to your computer and use it in GitHub Desktop.
Scraping HTML: Apache Tika + Boilerpipe to extract sentences and links from webpages.

This is an example of how to use Apache Tika and the boilerpipe extractor to get article sentences and links from an HTML page.

import org.apache.tika.exception.TikaException;
import org.apache.tika.sax.Link;
import org.xml.sax.SAXException;
import java.io.IOException;
/**
 * Command-line demo: scrapes one hard-coded page with {@link HtmlScraper} and prints
 * the result summary, then every extracted text block, then every link URI.
 */
public class ExtractTextAndLinksExample {

    /** Page scraped by {@link #main(String[])}. */
    static final String url = "http://www.flagsarenotlanguages.com/blog/why-flags-do-not-represent-language/";

    /**
     * Runs the scraper on {@link #url} and writes everything to standard output.
     *
     * @param args ignored
     * @throws TikaException propagated from the scraper
     * @throws IOException   propagated from the scraper
     * @throws SAXException  propagated from the scraper
     */
    public static void main(String[] args) throws TikaException, IOException, SAXException {
        final HtmlScraper.Results scraped = HtmlScraper.scrape(url);

        // one-line summary (title, url, counts) produced by Results#toString
        System.out.println(scraped);

        System.out.println("\nTextBlocks\n=================");
        for (String block : scraped.textBlocks) {
            System.out.println(block);
        }

        System.out.println("\nLinks\n=================");
        for (Link link : scraped.links) {
            System.out.println(link.getUri());
        }
    }
}
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.extractors.ArticleSentencesExtractor;
import de.l3s.boilerpipe.extractors.ExtractorBase;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.SAXException;
import javax.net.ssl.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
 * Fetches an HTML page and extracts its title, its text blocks (via boilerpipe's
 * {@link ArticleSentencesExtractor}) and its anchor links (via Tika's
 * {@link LinkContentHandler}).
 *
 * <p><b>Security note:</b> loading this class installs a JVM-wide all-trusting
 * {@code SSLSocketFactory} and {@code HostnameVerifier} on {@link HttpsURLConnection}
 * (see {@link #trustAllHttpsCertificates()}). This disables HTTPS certificate and
 * host name validation for every {@code HttpsURLConnection} that relies on the defaults,
 * not only for the scraper.
 */
public class HtmlScraper {

    private static final ExtractorBase extractor = ArticleSentencesExtractor.getInstance();

    static {
        // avoid SSL certificate errors
        // SECURITY: this turns off certificate validation globally; kept because existing
        // callers may rely on scraping hosts with invalid certificates.
        trustAllHttpsCertificates();
    }

    /** Plain holder for everything extracted from one page. */
    public static class Results {
        /** The URL that was passed to {@link HtmlScraper#scrape(String)}. */
        public String url;
        /** Value of the {@code "title"} metadata entry; may be {@code null}. */
        public String title;
        /** Text of every block found by the boilerpipe extractor, in document order. */
        public List<String> textBlocks;
        /** Anchor links with a non-empty, non-fragment URI, de-duplicated by URI. */
        public List<Link> links;

        Results() {
        }

        /**
         * One-line summary. Fields that have not been populated are reported with a
         * count of 0 instead of throwing a {@code NullPointerException}.
         */
        @Override
        public String toString() {
            return String.format("Results{ %s <%s>: %d links, %d textBlocks }",
                    title, url, sizeOf(links), sizeOf(textBlocks));
        }

        // null-tolerant size, so toString() is safe on a partially filled instance
        private static int sizeOf(List<?> list) {
            return list == null ? 0 : list.size();
        }
    }

    /**
     * Downloads {@code url}, parses it once with Tika's {@link HtmlParser}, and collects
     * the title, the text blocks and the anchor links.
     *
     * @param url absolute URL of the page to fetch
     * @return the populated {@link Results}
     * @throws IOException   if the URL is malformed, cannot be opened, or reading fails
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException  if one of the SAX content handlers fails
     */
    public static Results scrape(String url) throws IOException, TikaException, SAXException {
        // setup extractors
        LinkContentHandler linkHandler = new LinkContentHandler();
        // NOTE(review): the no-arg BodyContentHandler presumably applies Tika's default
        // write limit; confirm that is acceptable for very long pages.
        BoilerpipeContentHandler textHandler =
                new BoilerpipeContentHandler(new BodyContentHandler(), extractor);
        Metadata metadata = new Metadata();

        // fetch HTML and parse using boilerpipe+tika; try-with-resources guarantees the
        // network stream is closed even when parsing throws
        try (InputStream input = new URL(url).openStream()) {
            new HtmlParser().parse(input,
                    new TeeContentHandler(linkHandler, textHandler),
                    metadata,
                    new ParseContext());
        }

        // gather results
        Results results = new Results();
        results.url = url;
        results.title = metadata.get("title");

        List<TextBlock> blocks = textHandler.getTextDocument().getTextBlocks();
        results.textBlocks = blocks.stream()
                //.filter(TextBlock::isContent)
                .map(TextBlock::getText)
                .collect(Collectors.toList());

        // keep only <a> tags with non-empty href + avoid duplicates
        results.links = linkHandler.getLinks().stream()
                .filter(HtmlScraper::isUsableAnchor)
                .filter(distinctBy(Link::getUri))
                .collect(Collectors.toList());
        return results;
    }

    // ---------------------------------------------------
    // utilities

    // true for <a> links whose URI is present, non-empty and not a same-page fragment
    private static boolean isUsableAnchor(Link link) {
        if (!link.isAnchor()) {
            return false;
        }
        String uri = link.getUri();
        // defensive null check: do not rely on the handler always supplying a URI
        return uri != null && !uri.isEmpty() && !uri.startsWith("#");
    }

    // distinct by property in a Java stream
    // (stateful predicate backed by a plain HashSet: use with sequential streams only)
    private static <T> Predicate<T> distinctBy(Function<? super T, ?> keyExtractor) {
        final Set<Object> seen = new HashSet<>();
        return t -> seen.add(keyExtractor.apply(t));
    }

    // Make the URL#getConnection accept unsecure HTTPS certificates as well to avoid
    // `java.net.ssl.SSLHandshakeException`. Just call it once per JVM.
    // see http://www.rgagnon.com/javadetails/java-fix-certificate-problem-in-HTTPS.html
    // for more details
    // SECURITY: after this call every default HttpsURLConnection accepts any certificate
    // and any host name, which makes man-in-the-middle attacks undetectable.
    private static void trustAllHttpsCertificates() {
        try {
            TrustManager[] dummyTrustManagers = new TrustManager[]{new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] x509Certificates, String s)
                        throws CertificateException {
                    // accept everything
                }

                @Override
                public void checkServerTrusted(X509Certificate[] x509Certificates, String s)
                        throws CertificateException {
                    // accept everything
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // the X509TrustManager contract requires a non-null (possibly empty) array
                    return new X509Certificate[0];
                }
            }};

            SSLContext sc = SSLContext.getInstance("SSL");
            sc.init(null, dummyTrustManagers, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());

            // Create and install all-trusting host name verifier
            HttpsURLConnection.setDefaultHostnameVerifier((hostname, sslSession) -> true);
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            // best effort: scraping still works for sites with valid certificates
            System.out.println("Error setting up dummy certificate: " + e);
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Tika + boilerpipe HTML scraping example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>ch.derlin</groupId>
<artifactId>tika-example</artifactId>
<version>0.1</version>
<properties>
<!-- Single version shared by both Tika artifacts below so they stay in sync. -->
<tika.version>1.18</tika.version>
</properties>
<!-- NOTE(review): the Java sources import de.l3s.boilerpipe classes, but no boilerpipe
     artifact is declared here; presumably it is pulled in transitively by tika-parsers.
     Confirm, or declare it explicitly. -->
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compile at Java 8 source/target level (the sources use lambdas and streams). -->
<!-- NOTE(review): no plugin version is pinned here; consider adding one for
     reproducible builds. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment