Heilum · June 2, 2023 09:51
diff --git a/pom.xml b/pom.xml
 <project xmlns="http://maven.apache.org/POM/4.0.0"
 	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	<!-- Maven build descriptor for the readpdf sample; it only declares coordinates and dependencies. -->
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>com.swordfish.readpdf</groupId>
 	<artifactId>readpdf</artifactId>
 	<version>0.0.1</version>
 	<dependencies>
 		<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
 		<!-- Supplies the org.apache.pdfbox classes (PDDocument, PDFTextStripper) imported by the Java sample. -->
 		<dependency>
 			<groupId>org.apache.pdfbox</groupId>
 			<artifactId>pdfbox</artifactId>
 			<version>2.0.10</version>
 		</dependency>

 		<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
 		<!-- Presumably the source of the org.apache.tika.parser.pdf and org.apache.tika.parser.ocr classes used by the Java sample; kept at the same version as tika-core. -->
 		<dependency>
 			<groupId>org.apache.tika</groupId>
 			<artifactId>tika-parsers</artifactId>
 			<version>1.18</version>
 		</dependency>

 		<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
 		<!-- Core Tika API (Tika facade, TikaConfig, Metadata, ParseContext) imported by the Java sample. -->
 		<dependency>
 			<groupId>org.apache.tika</groupId>
 			<artifactId>tika-core</artifactId>
 			<version>1.18</version>
 		</dependency>

 		<!-- NOTE(review): the next two artifacts both look like JBIG2 ImageIO plugins (older levigo coordinates and newer org.apache.pdfbox coordinates); confirm whether both are really needed. -->
 		<dependency>
 			<groupId>com.levigo.jbig2</groupId>
 			<artifactId>levigo-jbig2-imageio</artifactId>
 			<version>1.6.5</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.pdfbox</groupId>
 			<artifactId>jbig2-imageio</artifactId>
 			<version>3.0.0</version>
 		</dependency>

 		<!-- NOTE(review): no SQLite/JDBC usage is visible in the accompanying Java sample; confirm this dependency is still required. -->
 		<dependency>
 			<groupId>org.xerial</groupId>
 			<artifactId>sqlite-jdbc</artifactId>
 			<version>3.23.1</version>
 		</dependency>


 		<!-- ImageIO plugins, presumably for decoding images embedded in PDFs. NOTE(review): core is 1.4.0 while jpeg2000 is 1.3.0; confirm the pairing is intended. -->
 		<dependency>
 			<groupId>com.github.jai-imageio</groupId>
 			<artifactId>jai-imageio-core</artifactId>
 			<version>1.4.0</version>
 		</dependency>
 		<dependency>
 			<groupId>com.github.jai-imageio</groupId>
 			<artifactId>jai-imageio-jpeg2000</artifactId>
 			<version>1.3.0</version>
 		</dependency>

 	</dependencies>
 </project>
diff --git a/Test.java b/Test.java


 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Paths;

 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;

 /**
  * Sample that extracts text from a PDF in three ways: the Tika facade, Tika with
  * Tesseract OCR for embedded images, and PDFBox's text stripper.
  *
  * <p>Setup for the OCR path:
  * <ol>
  *   <li>Install Tesseract: https://github.com/tesseract-ocr/tesseract/wiki</li>
  *   <li>Download the target language package from https://github.com/tesseract-ocr/tessdata
  *       and put it in the "tessdata" folder.</li>
  *   <li>Reference: https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/</li>
  * </ol>
  *
  * <p>Every extraction method prints the stack trace and returns an empty string on failure
  * instead of propagating the exception.
  */
 public class Main {

 	/** PDF processed by {@link #main(String[])} when no path is given on the command line. */
 	private static final String DEFAULT_PDF_PATH = "/work/projects/projects-2018/read_pdf2/vr.pdf";

 	/** Tesseract language code; the matching {@code chi_sim.traineddata} must be in the tessdata directory. */
 	private static final String OCR_LANGUAGE = "chi_sim";

 	// Machine-specific install locations (they look like Homebrew paths); adjust for your environment.
 	private static final String TESSERACT_PATH = "/usr/local/Cellar/tesseract/3.05.01/bin";
 	private static final String TESSDATA_PATH = "/usr/local/Cellar/tesseract/3.05.01/share/tessdata";

 	/**
 	 * Runs OCR-based extraction and prints the result to standard error.
 	 *
 	 * @param args optional; {@code args[0]} is the PDF path. Without it the built-in sample path is used.
 	 */
 	public static void main(String[] args) {
 		String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
 		System.err.println(getTextFromTesseract(pdfPath));
 	}

 	/**
 	 * Extracts text with the {@link Tika} facade using its default configuration.
 	 *
 	 * <p>Note: per the Tika documentation, {@code parseToString} caps the returned text at
 	 * {@code Tika#getMaxStringLength()} characters, so very long documents may come back truncated.
 	 *
 	 * @param filePath path of the file to parse
 	 * @return the extracted text, or an empty string if reading or parsing fails
 	 */
 	public static String getTextFromPdfByTika(String filePath) {
 		try {
 			return new Tika().parseToString(new File(filePath));
 		} catch (IOException | TikaException e) {
 			e.printStackTrace();
 			return "";
 		}
 	}

 	/**
 	 * Extracts text from a PDF, running Tesseract OCR on its inline images.
 	 *
 	 * @param filePath path of the PDF to parse
 	 * @return the extracted text, or an empty string if anything fails
 	 */
 	public static String getTextFromTesseract(String filePath) {
 		// try-with-resources closes the stream even when parsing throws.
 		try (InputStream pdf = Files.newInputStream(Paths.get(filePath))) {
 			Parser parser = new AutoDetectParser(TikaConfig.getDefaultConfig());

 			PDFParserConfig pdfConfig = new PDFParserConfig();
 			pdfConfig.setExtractInlineImages(true);

 			TesseractOCRConfig tesserConfig = new TesseractOCRConfig();
 			tesserConfig.setLanguage(OCR_LANGUAGE);
 			tesserConfig.setTesseractPath(TESSERACT_PATH);
 			// Place chi_sim.traineddata in the tessdata directory.
 			tesserConfig.setTessdataPath(TESSDATA_PATH);

 			ParseContext parseContext = new ParseContext();
 			// The parser is registered in its own context so it can be looked up for embedded content.
 			parseContext.set(Parser.class, parser);
 			parseContext.set(PDFParserConfig.class, pdfConfig);
 			parseContext.set(TesseractOCRConfig.class, tesserConfig);

 			// -1 disables the write limit. Collecting characters directly avoids the lossy
 			// round trip through platform-default-charset bytes.
 			BodyContentHandler handler = new BodyContentHandler(-1);
 			parser.parse(pdf, handler, new Metadata(), parseContext);
 			return handler.toString();
 		} catch (Exception e) {
 			// Broad catch kept on purpose: callers rely on "" for any failure.
 			e.printStackTrace();
 			return "";
 		}
 	}

 	/**
 	 * Reads the text content of a PDF file with PDFBox.
 	 *
 	 * @param filePath path of the PDF to read
 	 * @return the text read from the PDF, or an empty string if it cannot be loaded
 	 *         (including password-protected files) or read
 	 */
 	public static String getTextFromPdf(String filePath) {
 		// The document is closed as soon as the text has been extracted.
 		try (PDDocument pdDoc = PDDocument.load(new File(filePath))) {
 			return new PDFTextStripper().getText(pdDoc);
 		} catch (InvalidPasswordException e) {
 			e.printStackTrace();
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
 		return "";
 	}

 }
	<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<!-- Maven build descriptor for the readpdf sample; it only declares coordinates and dependencies. -->
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.swordfish.readpdf</groupId>
	<artifactId>readpdf</artifactId>
	<version>0.0.1</version>
	<dependencies>
	<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
	<!-- Supplies the org.apache.pdfbox classes (PDDocument, PDFTextStripper) imported by the Java sample. -->
	<dependency>
	<groupId>org.apache.pdfbox</groupId>
	<artifactId>pdfbox</artifactId>
	<version>2.0.10</version>
	</dependency>

	<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
	<!-- Presumably the source of the org.apache.tika.parser.pdf and org.apache.tika.parser.ocr classes used by the Java sample; kept at the same version as tika-core. -->
	<dependency>
	<groupId>org.apache.tika</groupId>
	<artifactId>tika-parsers</artifactId>
	<version>1.18</version>
	</dependency>

	<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
	<!-- Core Tika API (Tika facade, TikaConfig, Metadata, ParseContext) imported by the Java sample. -->
	<dependency>
	<groupId>org.apache.tika</groupId>
	<artifactId>tika-core</artifactId>
	<version>1.18</version>
	</dependency>

	<!-- NOTE(review): the next two artifacts both look like JBIG2 ImageIO plugins (older levigo coordinates and newer org.apache.pdfbox coordinates); confirm whether both are really needed. -->
	<dependency>
	<groupId>com.levigo.jbig2</groupId>
	<artifactId>levigo-jbig2-imageio</artifactId>
	<version>1.6.5</version>
	</dependency>
	<dependency>
	<groupId>org.apache.pdfbox</groupId>
	<artifactId>jbig2-imageio</artifactId>
	<version>3.0.0</version>
	</dependency>

	<!-- NOTE(review): no SQLite/JDBC usage is visible in the accompanying Java sample; confirm this dependency is still required. -->
	<dependency>
	<groupId>org.xerial</groupId>
	<artifactId>sqlite-jdbc</artifactId>
	<version>3.23.1</version>
	</dependency>


	<!-- ImageIO plugins, presumably for decoding images embedded in PDFs. NOTE(review): core is 1.4.0 while jpeg2000 is 1.3.0; confirm the pairing is intended. -->
	<dependency>
	<groupId>com.github.jai-imageio</groupId>
	<artifactId>jai-imageio-core</artifactId>
	<version>1.4.0</version>
	</dependency>
	<dependency>
	<groupId>com.github.jai-imageio</groupId>
	<artifactId>jai-imageio-jpeg2000</artifactId>
	<version>1.3.0</version>
	</dependency>

	</dependencies>
	</project>


	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.charset.Charset;
	import java.nio.file.Files;
	import java.nio.file.Paths;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
	import org.apache.pdfbox.text.PDFTextStripper;
	import org.apache.tika.Tika;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.parser.ocr.TesseractOCRConfig;
	import org.apache.tika.parser.pdf.PDFParserConfig;
	import org.apache.tika.sax.BodyContentHandler;

	/**
	 * Sample that extracts text from a PDF in three ways: the Tika facade, Tika with
	 * Tesseract OCR for embedded images, and PDFBox's text stripper.
	 *
	 * <p>Setup for the OCR path:
	 * <ol>
	 *   <li>Install Tesseract: https://github.com/tesseract-ocr/tesseract/wiki</li>
	 *   <li>Download the target language package from https://github.com/tesseract-ocr/tessdata
	 *       and put it in the "tessdata" folder.</li>
	 *   <li>Reference: https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/</li>
	 * </ol>
	 *
	 * <p>Every extraction method prints the stack trace and returns an empty string on failure
	 * instead of propagating the exception.
	 */
	public class Main {

		/** PDF processed by {@link #main(String[])} when no path is given on the command line. */
		private static final String DEFAULT_PDF_PATH = "/work/projects/projects-2018/read_pdf2/vr.pdf";

		/** Tesseract language code; the matching {@code chi_sim.traineddata} must be in the tessdata directory. */
		private static final String OCR_LANGUAGE = "chi_sim";

		// Machine-specific install locations (they look like Homebrew paths); adjust for your environment.
		private static final String TESSERACT_PATH = "/usr/local/Cellar/tesseract/3.05.01/bin";
		private static final String TESSDATA_PATH = "/usr/local/Cellar/tesseract/3.05.01/share/tessdata";

		/**
		 * Runs OCR-based extraction and prints the result to standard error.
		 *
		 * @param args optional; {@code args[0]} is the PDF path. Without it the built-in sample path is used.
		 */
		public static void main(String[] args) {
			String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
			System.err.println(getTextFromTesseract(pdfPath));
		}

		/**
		 * Extracts text with the {@link Tika} facade using its default configuration.
		 *
		 * <p>Note: per the Tika documentation, {@code parseToString} caps the returned text at
		 * {@code Tika#getMaxStringLength()} characters, so very long documents may come back truncated.
		 *
		 * @param filePath path of the file to parse
		 * @return the extracted text, or an empty string if reading or parsing fails
		 */
		public static String getTextFromPdfByTika(String filePath) {
			try {
				return new Tika().parseToString(new File(filePath));
			} catch (IOException | TikaException e) {
				e.printStackTrace();
				return "";
			}
		}

		/**
		 * Extracts text from a PDF, running Tesseract OCR on its inline images.
		 *
		 * @param filePath path of the PDF to parse
		 * @return the extracted text, or an empty string if anything fails
		 */
		public static String getTextFromTesseract(String filePath) {
			// try-with-resources closes the stream even when parsing throws.
			try (InputStream pdf = Files.newInputStream(Paths.get(filePath))) {
				Parser parser = new AutoDetectParser(TikaConfig.getDefaultConfig());

				PDFParserConfig pdfConfig = new PDFParserConfig();
				pdfConfig.setExtractInlineImages(true);

				TesseractOCRConfig tesserConfig = new TesseractOCRConfig();
				tesserConfig.setLanguage(OCR_LANGUAGE);
				tesserConfig.setTesseractPath(TESSERACT_PATH);
				// Place chi_sim.traineddata in the tessdata directory.
				tesserConfig.setTessdataPath(TESSDATA_PATH);

				ParseContext parseContext = new ParseContext();
				// The parser is registered in its own context so it can be looked up for embedded content.
				parseContext.set(Parser.class, parser);
				parseContext.set(PDFParserConfig.class, pdfConfig);
				parseContext.set(TesseractOCRConfig.class, tesserConfig);

				// -1 disables the write limit. Collecting characters directly avoids the lossy
				// round trip through platform-default-charset bytes.
				BodyContentHandler handler = new BodyContentHandler(-1);
				parser.parse(pdf, handler, new Metadata(), parseContext);
				return handler.toString();
			} catch (Exception e) {
				// Broad catch kept on purpose: callers rely on "" for any failure.
				e.printStackTrace();
				return "";
			}
		}

		/**
		 * Reads the text content of a PDF file with PDFBox.
		 *
		 * @param filePath path of the PDF to read
		 * @return the text read from the PDF, or an empty string if it cannot be loaded
		 *         (including password-protected files) or read
		 */
		public static String getTextFromPdf(String filePath) {
			// The document is closed as soon as the text has been extracted.
			try (PDDocument pdDoc = PDDocument.load(new File(filePath))) {
				return new PDFTextStripper().getText(pdDoc);
			} catch (InvalidPasswordException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
			return "";
		}

	}