Last active
June 9, 2023 09:19
-
-
Save qlong8807/6b3339e17cf056d769ff8d031b5c85dc to your computer and use it in GitHub Desktop.
使用Tess4j进行中英文识别。OCR识别。图片转文字。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
共三步: | |
1. springboot-maven项目,引入依赖: | |
<dependency> | |
<groupId>com.github.jai-imageio</groupId> | |
<artifactId>jai-imageio-core</artifactId> | |
<version>1.4.0</version> | |
</dependency> | |
<dependency> | |
<groupId>net.java.dev.jna</groupId> | |
<artifactId>jna</artifactId> | |
<version>5.3.1</version> | |
</dependency> | |
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j --> | |
<dependency> | |
<groupId>net.sourceforge.tess4j</groupId> | |
<artifactId>tess4j</artifactId> | |
<version>4.4.0</version> | |
<exclusions> | |
<exclusion> | |
<groupId>net.java.dev.jna</groupId> | |
<artifactId>jna</artifactId> | |
</exclusion> | |
</exclusions> | |
</dependency> | |
//2. 如果要识别中文,则需要该步骤:在项目根目录下创建 tessdata 文件夹(与 src 同级),并在该文件夹下放入 chi_sim.traineddata 文件(可从 https://github.com/qlong8807/tessdata 获取)。
//3. java代码 | |
import net.sourceforge.tess4j.ITesseract; | |
import net.sourceforge.tess4j.Tesseract; | |
import net.sourceforge.tess4j.Tesseract1; | |
import net.sourceforge.tess4j.TesseractException; | |
import net.sourceforge.tess4j.util.LoadLibs; | |
import java.io.*; | |
public class Tess4jUtils { | |
/** | |
* 识别英文和数字 | |
* @param path 图片路径 | |
* @return 识别后的结果 | |
*/ | |
public static String readChar(String path){ | |
// JNA Interface Mapping | |
// ITesseract instance = new Tesseract(); | |
// JNA Direct Mapping 这两个实例都可以使用 | |
ITesseract instance = new Tesseract1(); | |
File imageFile = new File(path); | |
//这样就能使用classpath目录下的训练库了 | |
File tessDataFolder = LoadLibs.extractTessResources("tessdata"); | |
//Set the tessdata path | |
instance.setDatapath(tessDataFolder.getAbsolutePath()); | |
//英文库识别数字比较准确 | |
instance.setLanguage(Const.ENG); | |
return getOCRText(instance, imageFile); | |
} | |
/** | |
* 识别中文和数字,混合的中英文识别不是很准确 | |
* @param path 图片路径 | |
* @param dataPath 训练库路径 | |
* @param language 语言字库 | |
* @return 识别结果 | |
*/ | |
public static String readChar(String path, String dataPath, String language){ | |
File imageFile = new File(path); | |
ITesseract instance = new Tesseract(); | |
instance.setDatapath(dataPath); | |
//英文库识别数字比较准确 | |
instance.setLanguage(language); | |
return getOCRText(instance, imageFile); | |
} | |
/** | |
* 识别图片文件中的文字 | |
* @param instance | |
* @param imageFile | |
* @return | |
*/ | |
private static String getOCRText(ITesseract instance, File imageFile){ | |
String result = null; | |
try { | |
result = instance.doOCR(imageFile); | |
} catch (TesseractException e) { | |
e.printStackTrace(); | |
} | |
return result; | |
} | |
public static void main(String[] args) { | |
long start = System.currentTimeMillis(); | |
// String filePath = "E:\\idea_workspace\\plb\\plbbr\\src\\main\\resources\\image\\w.png"; | |
// System.out.println(readChar(filePath)); | |
// System.err.println(System.currentTimeMillis() - start); | |
String filePath2 = "E:\\idea_workspace\\plb\\plbbr\\src\\main\\resources\\image\\cen.png"; | |
String trainedDataPath2 = "E:\\idea_workspace\\plb\\plbbr\\tessdata";//该目录下只有chi_sim.traineddata | |
System.out.println(readChar(filePath2, trainedDataPath2, Const.CHI_SIM)); | |
System.err.println("中文识别:\n"+(System.currentTimeMillis() - start)); | |
} | |
} | |
/**
 * Language identifiers passed to Tesseract's {@code setLanguage}.
 */
class Const {

    /** Language identifier used for English text and digits. */
    public static final String ENG = "eng";

    /** Language identifier used for Chinese text; pairs with {@code chi_sim.traineddata}. */
    public static final String CHI_SIM = "chi_sim";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment