Skip to content

Instantly share code, notes, and snippets.

@yoshi0309
Created May 19, 2014 01:26
Show Gist options
  • Save yoshi0309/bd93cb7b1dda6536ada0 to your computer and use it in GitHub Desktop.
Save yoshi0309/bd93cb7b1dda6536ada0 to your computer and use it in GitHub Desktop.
Apache Tika 1.5 - AutoDetectParser example
package yoshida.tika_sample;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Hello world!
*
*/
public class App
{
public static void main( String[] args)
{
System.out.println( "Hello World!" );
String target = "XXX.pdf";
File document = new File(target);
Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try {
parser.parse(new FileInputStream(document), handler, metadata, new ParseContext());
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
} catch (TikaException e) {
e.printStackTrace();
}
System.out.println(metadata);
System.out.println(handler.toString());
}
}
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>yoshida</groupId>
<artifactId>tika-sample</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>tika-sample</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.5</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.5</version>
</dependency>
</dependencies>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment