Last active
December 9, 2021 10:33
-
-
Save billju/c55b75e91c774e0bb837fc00253068cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download: https://tesseract-ocr.github.io/tessdoc/Downloads.html
# Trained weights: https://github.com/tesseract-ocr/tessdata_best
# .box file format (one glyph per line): <glyph> <left> <bottom> <right> <top> <page>, e.g. 字 6 394 45 410 0
import os
import shutil
import subprocess
from glob import glob

from PIL import Image
# Language code handed to tesseract via -l; also the prefix of every
# generated training artefact.
lang = 'chi_tra'
# Font name embedded in the training file names (<lang>.<font>.exp<N>).
font = 'ocrb'
# Name of the working directory that holds the training data.
dist = 'tessdata'

# Switch into the working directory unless the script is already running
# inside it. The names are compared by value (!=): an identity test between
# two distinct string objects is always true, which would create a nested
# tessdata/tessdata directory on a re-run.
if os.path.basename(os.getcwd()) != dist:
    os.makedirs(dist, exist_ok=True)
    os.chdir(dist)
# Source images are read from jpg/; make sure it exists in either case.
os.makedirs('jpg', exist_ok=True)
# Convert every PNG in the working directory to an RGB JPEG inside jpg/,
# the folder the training loop below reads from. Saving next to the PNG
# would be pointless: the cleanup loop that follows deletes every *.jpg in
# the working directory. The output name uses the full file stem so that
# names of any length survive intact.
os.makedirs('jpg', exist_ok=True)
for png in glob('*.png'):
    stem = os.path.splitext(png)[0]
    with Image.open(png) as picture:
        picture.convert('RGB').save(os.path.join('jpg', f'{stem}.jpg'))

# Remove the <lang>.<font>.exp<N>.jpg copies left behind by an earlier run.
for leftover in glob('*.jpg'):
    os.remove(leftover)

# Sorted so that the exp<N> numbering is stable from run to run.
sources = sorted(glob('jpg/*.jpg'))
if not sources:
    raise SystemExit('No training images found in jpg/ - nothing to train on.')

# For each source image: copy it under the <lang>.<font>.exp<N> name, then
# let tesseract generate the bounding boxes (.box) and the training
# features (.tr).
for i, image in enumerate(sources):
    train = f'{lang}.{font}.exp{i}'
    shutil.copy(image, f'{train}.jpg')
    for config in (('batch.nochop', 'makebox'), ('nobatch', 'box.train')):
        # Argument list instead of a shell string: no quoting problems.
        outcome = subprocess.run(
            ['tesseract', '-l', lang, f'{train}.jpg', train, *config],
            check=False,
        )
        # Best effort per image: report the failure and keep going.
        if outcome.returncode != 0:
            print(f'tesseract {config[-1]} failed for {train}.jpg '
                  f'(exit code {outcome.returncode})')

# Space-separated file lists consumed by the training commands.
boxes = ' '.join(glob('*.box'))
trs = ' '.join(glob('*.tr'))
# Build the legacy model files from the generated .box / .tr files.
box_files = boxes.split()
tr_files = trs.split()
if not box_files or not tr_files:
    raise SystemExit('No .box/.tr files found - generate the training data first.')

# Write the font properties. Each line has the form
#   <fontname> <italic> <bold> <fixed> <serif> <fraktur>
# and the name must be the font component of the <lang>.<font>.exp<N>
# training file names, i.e. `font` rather than `lang`.
with open('font_properties', 'w', encoding='utf-8') as props:
    props.write(f'{font} 0 0 0 1 0\n')

# check=True stops the pipeline at the first failing tool, so the renames
# below never pick up stale outputs from an earlier run.
subprocess.run(
    ['unicharset_extractor', '--output_unicharset', 'unicharset', *box_files],
    check=True,
)
subprocess.run(
    ['mftraining', '-F', 'font_properties', '-U', 'unicharset',
     '-O', f'{lang}.unicharset', '-D', '.', *tr_files],
    check=True,
)
subprocess.run(['cntraining', '-D', '.', *tr_files], check=True)

# Prefix the training outputs with the language code so combine_tessdata
# can collect them. os.replace overwrites results of a previous run on
# every platform, whereas os.rename fails on Windows if the target exists.
for part in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
    os.replace(part, f'{lang}.{part}')
subprocess.run(['combine_tessdata', f'{lang}.'], check=True)
# Engine (--oem), OCR_ENGINE_MODE
#   0 = 'Legacy'
#   1 = 'LSTM'
# Mode (--psm), PAGE_SEG_MODE
#   0  Orientation and script detection (OSD) only.
#   1  Automatic page segmentation with OSD.
#   2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
#   3  Fully automatic page segmentation, but no OSD. (Default)
#   4  Assume a single column of text of variable sizes.
#   5  Assume a single uniform block of vertically aligned text.
#   6  Assume a single uniform block of text.
#   7  Treat the image as a single text line.
#   8  Treat the image as a single word.
#   9  Treat the image as a single word in a circle.
#   10 Treat the image as a single character.
#   11 Sparse text. Find as much text as possible in no particular order.
#   12 Sparse text with OSD.
#   13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
# Output format (appended after the language): '' | 'hocr' | 'tsv'

# Run recognition on the last training image; the output base name is
# 'result'. `train` only exists if the training loop processed at least one
# image, so fail with a clear message instead of a bare NameError.
# NOTE(review): -l is resolved against tesseract's default tessdata
# location; presumably the freshly combined model has to be installed there
# (or passed via --tessdata-dir) to be the one used here - confirm.
try:
    sample = f'{train}.jpg'
except NameError:
    raise SystemExit('No training image available to run recognition on.') from None

outcome = subprocess.run(
    ['tesseract', sample, 'result', '-l', lang, '--oem', '1', '--psm', '3'],
    check=False,
)
if outcome.returncode != 0:
    print(f'tesseract recognition failed for {sample} '
          f'(exit code {outcome.returncode})')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment