Skip to content

Instantly share code, notes, and snippets.

@qlong8807
Last active June 9, 2023 09:19
Show Gist options
  • Save qlong8807/6b3339e17cf056d769ff8d031b5c85dc to your computer and use it in GitHub Desktop.
Save qlong8807/6b3339e17cf056d769ff8d031b5c85dc to your computer and use it in GitHub Desktop.
使用Tess4j进行中英文识别。OCR识别。图片转文字。
共三步:
1. springboot-maven项目,引入依赖:
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<version>5.3.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.4.0</version>
<exclusions>
<exclusion>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
</exclusion>
</exclusions>
</dependency>
//2. 如果要识别中文,则需要该步骤:在项目根目录下创建 tessdata 文件夹(与 src 同级),并在该文件夹下放入 chi_sim.traineddata 文件(可从 https://github.com/qlong8807/tessdata 获取)。
//3. java代码
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.util.LoadLibs;
import java.io.*;
/**
 * Static helpers that run Tess4J OCR on an image file and return the recognized text.
 *
 * <p>Every recognition method returns {@code null} when Tess4J reports a failure; the
 * failure is written to {@code System.err} instead of being propagated.
 */
public class Tess4jUtils {

    /**
     * Recognizes English letters and digits in an image.
     *
     * <p>The trained data comes from the {@code tessdata} resources extracted by
     * {@link LoadLibs#extractTessResources(String)}, so no data directory has to be passed in.
     *
     * @param path path of the image file
     * @return the recognized text, or {@code null} if recognition failed
     */
    public static String readChar(String path) {
        // Tesseract (JNA interface mapping) and Tesseract1 (JNA direct mapping) can both be
        // used here; this overload uses the direct-mapping variant.
        ITesseract instance = new Tesseract1();
        File imageFile = new File(path);
        // Extracting the resources makes the trained data on the classpath usable.
        File tessDataFolder = LoadLibs.extractTessResources("tessdata");
        instance.setDatapath(tessDataFolder.getAbsolutePath());
        // The English data recognizes digits fairly accurately.
        instance.setLanguage(Const.ENG);
        return getOCRText(instance, imageFile);
    }

    /**
     * Recognizes text with a caller-supplied trained-data directory and language.
     *
     * <p>Intended for Chinese text and digits; recognition of mixed Chinese/English text is
     * not very accurate.
     *
     * @param path     path of the image file
     * @param dataPath directory that holds the {@code *.traineddata} files
     * @param language language code of the trained data to use, e.g. {@link Const#CHI_SIM}
     * @return the recognized text, or {@code null} if recognition failed
     */
    public static String readChar(String path, String dataPath, String language) {
        File imageFile = new File(path);
        ITesseract instance = new Tesseract();
        instance.setDatapath(dataPath);
        instance.setLanguage(language);
        return getOCRText(instance, imageFile);
    }

    /**
     * Runs OCR on the given image with an already configured engine.
     *
     * @param instance  configured Tess4J engine
     * @param imageFile image to recognize
     * @return the recognized text, or {@code null} if Tess4J threw a {@link TesseractException}
     */
    private static String getOCRText(ITesseract instance, File imageFile) {
        try {
            return instance.doOCR(imageFile);
        } catch (TesseractException e) {
            // Best-effort contract: callers get null on failure. Name the input first so the
            // stack trace can be tied to a specific image.
            System.err.println("OCR failed for image: " + imageFile.getAbsolutePath());
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns {@code args[index]} when it is present, otherwise {@code fallback}.
     */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args != null && index < args.length && args[index] != null) {
            return args[index];
        }
        return fallback;
    }

    /**
     * Demo entry point: recognizes one image and prints the text and the elapsed milliseconds.
     *
     * <p>Optional arguments, each falling back to the original sample value when omitted:
     * <ol>
     *   <li>image path</li>
     *   <li>trained-data directory</li>
     *   <li>language code</li>
     * </ol>
     * For English-only input, {@link #readChar(String)} can be called with just the image path.
     *
     * @param args optional overrides as described above
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        String filePath2 = argOrDefault(args, 0,
                "E:\\idea_workspace\\plb\\plbbr\\src\\main\\resources\\image\\cen.png");
        // The sample directory contains only chi_sim.traineddata.
        String trainedDataPath2 = argOrDefault(args, 1, "E:\\idea_workspace\\plb\\plbbr\\tessdata");
        String language = argOrDefault(args, 2, Const.CHI_SIM);
        System.out.println(readChar(filePath2, trainedDataPath2, language));
        System.err.println("中文识别:\n" + (System.currentTimeMillis() - start));
    }
}
/**
 * Language codes passed to {@code ITesseract.setLanguage} by {@code Tess4jUtils}.
 */
class Const {

    /** Language code for English. */
    public static final String ENG = "eng";

    /** Language code for Chinese; corresponds to the {@code chi_sim.traineddata} file. */
    public static final String CHI_SIM = "chi_sim";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment