Created
January 6, 2017 07:09
-
-
Save zhouhoo/51f1ff85d0bacb7446fb84a6bb516129 to your computer and use it in GitHub Desktop.
Use Apache Tika to convert PDF files to plain text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converting PDF to text is hard, but Apache Tika handles it well: it auto-detects the document format and chooses a suitable parser for it.
package june; | |
import java.io.BufferedInputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.OutputStream; | |
import org.apache.tika.exception.TikaException; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.parser.AutoDetectParser; | |
import org.apache.tika.parser.ParseContext; | |
import org.apache.tika.parser.Parser; | |
import org.apache.tika.sax.BodyContentHandler; | |
import org.xml.sax.ContentHandler; | |
import org.xml.sax.SAXException; | |
public class TikeParser { | |
public static void main(String[] args){ | |
InputStream is = null; | |
OutputStream out =null; | |
//String outPutFile="E:/notice/errorpdf/1.txt"; | |
String dirPathIn="C:/Users/Administrator/Desktop/finance_year/"; | |
String dirPathOut="E:/notice/finance_year_txt_s/"; | |
File dir = new File(dirPathIn); | |
String[] filenames = dir.list(); | |
int lenFiles = filenames.length; | |
try { | |
for(int i=0;i<lenFiles;i++){ | |
System.out.println(i+" : "+filenames[i]); | |
is = new BufferedInputStream(new FileInputStream(new File(dirPathIn+filenames[i]))); | |
out= new FileOutputStream(dirPathOut+i+".txt"); | |
Parser parser = new AutoDetectParser(); | |
ContentHandler handler = new BodyContentHandler(out); | |
Metadata metadata = new Metadata(); | |
parser.parse(is, handler, metadata, new ParseContext()); | |
} | |
// for (String name : metadata.names()) { | |
// String value = metadata.get(name); | |
// | |
// if (value != null) { | |
// System.out.println("Metadata Name: " + name); | |
// System.out.println("Metadata Value: " + value); | |
// } | |
// } | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} catch (TikaException e) { | |
e.printStackTrace(); | |
} catch (SAXException e) { | |
e.printStackTrace(); | |
} finally { | |
if (is != null) { | |
try { | |
is.close(); | |
out.close(); | |
} catch(IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment