Last active
June 2, 2023 09:51
-
-
Save Heilum/af7dcc1fa26762ea459648e4d6a68fd1 to your computer and use it in GitHub Desktop.
Apache Tika + Tesseract-OCR to scan Chinese text in pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.swordfish.readpdf</groupId>
  <artifactId>readpdf</artifactId>
  <version>0.0.1</version>

  <!-- Pin the source encoding and the language level so the build does not
       depend on the platform charset or on the compiler plugin's default. -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- PDF parsing / plain text extraction.
         https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.10</version>
    </dependency>

    <!-- Tika parser implementations (PDF parser, Tesseract OCR parser, ...).
         https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
      <version>1.18</version>
    </dependency>

    <!-- Tika core API; keep the version in lock-step with tika-parsers.
         https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.18</version>
    </dependency>

    <!-- ImageIO plugin for JBIG2-compressed images embedded in PDFs.
         This is the Apache-maintained continuation of
         com.levigo.jbig2:levigo-jbig2-imageio, which was previously declared
         alongside it; only one JBIG2 plugin is needed, so the levigo artifact
         has been dropped. -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>jbig2-imageio</artifactId>
      <version>3.0.0</version>
    </dependency>

    <dependency>
      <groupId>org.xerial</groupId>
      <artifactId>sqlite-jdbc</artifactId>
      <version>3.23.1</version>
    </dependency>

    <!-- Additional ImageIO codecs (TIFF etc. and JPEG 2000) for embedded images. -->
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-core</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-jpeg2000</artifactId>
      <version>1.3.0</version>
    </dependency>
  </dependencies>
</project>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.ByteArrayOutputStream; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.nio.charset.Charset; | |
import java.nio.file.Files; | |
import java.nio.file.Paths; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; | |
import org.apache.pdfbox.text.PDFTextStripper; | |
import org.apache.tika.Tika; | |
import org.apache.tika.config.TikaConfig; | |
import org.apache.tika.exception.TikaException; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.parser.AutoDetectParser; | |
import org.apache.tika.parser.ParseContext; | |
import org.apache.tika.parser.Parser; | |
import org.apache.tika.parser.ocr.TesseractOCRConfig; | |
import org.apache.tika.parser.pdf.PDFParserConfig; | |
import org.apache.tika.sax.BodyContentHandler; | |
/** | |
1.install tesseract => https://github.com/tesseract-ocr/tesseract/wiki | |
2.download your target language package from :https://github.com/tesseract-ocr/tessdata and put in the "tessdata" folder | |
3.reference => https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/ | |
*/ | |
public class Main { | |
public static void main(String[] args) { | |
// TODO Auto-generated method stub | |
System.err.println(getTextFromTesseract("/work/projects/projects-2018/read_pdf2/vr.pdf")); | |
} | |
public static String getTextFromPdfByTika(String filePath) { | |
File file = new File(filePath); | |
String content; | |
try { | |
content = new Tika().parseToString(file); | |
return content; | |
} catch (IOException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (TikaException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
return ""; | |
} | |
public static String getTextFromTesseract(String filePath) { | |
try { | |
InputStream pdf = Files.newInputStream(Paths.get(filePath)); | |
ByteArrayOutputStream out = new ByteArrayOutputStream(); | |
TikaConfig config = TikaConfig.getDefaultConfig(); | |
// TikaConfig fromFile = new TikaConfig("/path/to/file"); | |
BodyContentHandler handler = new BodyContentHandler(out); | |
Parser parser = new AutoDetectParser(config); | |
Metadata meta = new Metadata(); | |
ParseContext parsecontext = new ParseContext(); | |
PDFParserConfig pdfConfig = new PDFParserConfig(); | |
pdfConfig.setExtractInlineImages(true); | |
TesseractOCRConfig tesserConfig = new TesseractOCRConfig(); | |
tesserConfig.setLanguage("chi_sim"); | |
tesserConfig.setTesseractPath("/usr/local/Cellar/tesseract/3.05.01/bin"); | |
//把chi_sim.traineddata放置在tessdata目录下 | |
tesserConfig.setTessdataPath("/usr/local/Cellar/tesseract/3.05.01/share/tessdata"); | |
parsecontext.set(Parser.class, parser); | |
parsecontext.set(PDFParserConfig.class, pdfConfig); | |
parsecontext.set(TesseractOCRConfig.class, tesserConfig); | |
parser.parse(pdf, handler, meta, parsecontext); | |
String s = new String(out.toByteArray(),Charset.defaultCharset()); | |
return s; | |
} catch (Exception e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
return ""; | |
} | |
} | |
/** | |
* | |
* @Title: getTextFromPdf | |
* @Description: 读取pdf文件内容 | |
* @param filePath | |
* @return: 读出的pdf的内容 | |
*/ | |
public static String getTextFromPdf(String filePath) { | |
PDDocument pdDoc; | |
try { | |
pdDoc = PDDocument.load(new File(filePath)); | |
PDFTextStripper pdfStripper = new PDFTextStripper(); | |
String result = pdfStripper.getText(pdDoc); | |
return result; | |
} catch (InvalidPasswordException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} catch (IOException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
return ""; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment