billju · December 9, 2021 10:33
diff --git a/tesseract_ocr_train.py b/tesseract_ocr_train.py
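 # Tesseract downloads: https://tesseract-ocr.github.io/tessdoc/Downloads.html
 # Model weights: https://github.com/tesseract-ocr/tessdata_best
 # .box file format, one glyph per line: <char> <left> <bottom> <right> <top> <page>, e.g. "字 6 394 45 410 0"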
 import os
 import shutil
 from PIL import Image
 from glob import glob
 # Training configuration: Tesseract language code, the font tag used in the
 # training file names, and the working directory that holds the training data.
 lang = 'chi_tra'
 font = 'ocrb'
 dist = 'tessdata'
 # Switch into the working directory unless the script already runs from it.
 # The names are compared by value (`!=`): an identity test on a freshly built
 # string is practically always true, which would create and enter a nested
 # `tessdata/tessdata` whenever the script is started from inside the folder.
 if os.path.basename(os.getcwd()) != dist:
     os.makedirs(dist, exist_ok=True)
     os.chdir(dist)
 # Source images are read from `jpg/`, so make sure it exists in both cases.
 os.makedirs('jpg', exist_ok=True)
 # Convert every PNG in the working directory to an RGB JPEG stored in `jpg/`,
 # the folder the training step reads its images from. Writing the result next
 # to the PNG would be pointless: every `*.jpg` in the working directory is
 # deleted before the training images are collected.
 os.makedirs('jpg', exist_ok=True)
 for image in glob('*.png'):
     # Strip only the extension; a fixed-length slice would truncate the name
     # and make different images overwrite each other.
     stem = os.path.splitext(image)[0]
     # The context manager closes the source file once the copy is saved.
     with Image.open(image) as picture:
         picture.convert('RGB').save(os.path.join('jpg', stem + '.jpg'))
 # Copy the image files (.jpg), generate the recognition boxes (.box) and the
 # training data (.tr).
 # Start by deleting every JPEG that is currently in the working directory.
 for stale in glob('*.jpg'):
     os.remove(stale)
 # Each image found in `jpg/` is copied under the `<lang>.<font>.exp<N>` name
 # and then passed to tesseract twice: once with `makebox`, once with
 # `box.train`.
 for number, source in enumerate(glob('jpg/*.jpg')):
     train = f'{lang}.{font}.exp{number}'
     shutil.copy(source, f'{train}.jpg')
     commands = (
         f'tesseract -l {lang} {train}.jpg {train} batch.nochop makebox',
         f'tesseract -l {lang} {train}.jpg {train} nobatch box.train',
     )
     for command in commands:
         os.system(command)
 # Space-separated file lists that are interpolated into the trainer commands.
 trs = ' '.join(glob('*.tr'))
 boxes = ' '.join(glob('*.box'))
 # Write the font properties file passed to mftraining via `-F`. An entry reads
 # `<fontname> <italic> <bold> <fixed> <serif> <fraktur>`, and the name must be
 # the font tag used in the training file names (`font`), not the language
 # code, otherwise mftraining cannot match the .tr data to its properties.
 with open('font_properties', 'w', encoding='utf-8') as properties:
     properties.write(f'{font} 0 0 0 1 0\n')
 # Extract the character set from the box files, then run the two trainers on
 # the .tr files.
 os.system(f'unicharset_extractor --output_unicharset unicharset {boxes}')
 os.system(f'mftraining -F font_properties -U unicharset -O {lang}.unicharset -D . {trs}')
 os.system(f'cntraining -D . {trs}')
 # Prefix the trainer outputs with the language code before they are combined.
 # os.replace overwrites results left by an earlier run; os.rename would raise
 # FileExistsError on Windows when the target already exists.
 for component in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
     os.replace(component, f'{lang}.{component}')
 os.system(f'combine_tessdata {lang}.')

 # Engine selection (--oem), OCR_ENGINE_MODE:
 #   0 = 'Legacy'
 #   1 = 'LSTM'

 # Page segmentation (--psm), PAGE_SEG_MODE:
 #   0   Orientation and script detection (OSD) only.
 #   1   Automatic page segmentation with OSD.
 #   2   Automatic page segmentation, but no OSD, or OCR. (not implemented)
 #   3   Fully automatic page segmentation, but no OSD. (Default)
 #   4   Assume a single column of text of variable sizes.
 #   5   Assume a single uniform block of vertically aligned text.
 #   6   Assume a single uniform block of text.
 #   7   Treat the image as a single text line.
 #   8   Treat the image as a single word.
 #   9   Treat the image as a single word in a circle.
 #   10  Treat the image as a single character.
 #   11  Sparse text. Find as much text as possible in no particular order.
 #   12  Sparse text with OSD.
 #   13  Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

 # Output format (appended after the language): '' | 'hocr' | 'tsv'
 # Recognise the image named by `train` (the last one prepared above) with the
 # LSTM engine and fully automatic page segmentation.
 ocr_command = f'tesseract {train}.jpg result -l {lang} --oem 1 --psm 3'
 os.system(ocr_command)
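	# Tesseract downloads: https://tesseract-ocr.github.io/tessdoc/Downloads.html
	# Model weights: https://github.com/tesseract-ocr/tessdata_best
	# .box file format, one glyph per line: <char> <left> <bottom> <right> <top> <page>, e.g. "字 6 394 45 410 0"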
	import os
	import shutil
	from PIL import Image
	from glob import glob
	# Training configuration: Tesseract language code, the font tag used in the
	# training file names, and the working directory that holds the training data.
	lang = 'chi_tra'
	font = 'ocrb'
	dist = 'tessdata'
	# Switch into the working directory unless the script already runs from it.
	# The names are compared by value (`!=`): an identity test on a freshly built
	# string is practically always true, which would create and enter a nested
	# `tessdata/tessdata` whenever the script is started from inside the folder.
	if os.path.basename(os.getcwd()) != dist:
	    os.makedirs(dist, exist_ok=True)
	    os.chdir(dist)
	# Source images are read from `jpg/`, so make sure it exists in both cases.
	os.makedirs('jpg', exist_ok=True)
	# Convert every PNG in the working directory to an RGB JPEG stored in `jpg/`,
	# the folder the training step reads its images from. Writing the result next
	# to the PNG would be pointless: every `*.jpg` in the working directory is
	# deleted before the training images are collected.
	os.makedirs('jpg', exist_ok=True)
	for image in glob('*.png'):
	    # Strip only the extension; a fixed-length slice would truncate the name
	    # and make different images overwrite each other.
	    stem = os.path.splitext(image)[0]
	    # The context manager closes the source file once the copy is saved.
	    with Image.open(image) as picture:
	        picture.convert('RGB').save(os.path.join('jpg', stem + '.jpg'))
	# Copy the image files (.jpg), generate the recognition boxes (.box) and the
	# training data (.tr).
	# Start by deleting every JPEG that is currently in the working directory.
	for image in glob('*.jpg'):
	    os.remove(image)
	# Each image found in `jpg/` is copied under the `<lang>.<font>.exp<N>` name
	# and then passed to tesseract twice: once with `makebox`, once with
	# `box.train`. The loop bodies are indented again so the section parses.
	for i, image in enumerate(glob('jpg/*.jpg')):
	    train = f'{lang}.{font}.exp{i}'
	    shutil.copy(image, f'{train}.jpg')
	    os.system(f'tesseract -l {lang} {train}.jpg {train} batch.nochop makebox')
	    os.system(f'tesseract -l {lang} {train}.jpg {train} nobatch box.train')
	# Space-separated file lists that are interpolated into the trainer commands.
	boxes = ' '.join(glob('*.box'))
	trs = ' '.join(glob('*.tr'))
	# Write the font properties file passed to mftraining via `-F`. An entry reads
	# `<fontname> <italic> <bold> <fixed> <serif> <fraktur>`, and the name must be
	# the font tag used in the training file names (`font`), not the language
	# code, otherwise mftraining cannot match the .tr data to its properties.
	with open('font_properties', 'w', encoding='utf-8') as properties:
	    properties.write(f'{font} 0 0 0 1 0\n')
	# Extract the character set from the box files, then run the two trainers on
	# the .tr files.
	os.system(f'unicharset_extractor --output_unicharset unicharset {boxes}')
	os.system(f'mftraining -F font_properties -U unicharset -O {lang}.unicharset -D . {trs}')
	os.system(f'cntraining -D . {trs}')
	# Prefix the trainer outputs with the language code before they are combined.
	# os.replace overwrites results left by an earlier run; os.rename would raise
	# FileExistsError on Windows when the target already exists.
	for component in ('inttemp', 'normproto', 'pffmtable', 'shapetable'):
	    os.replace(component, f'{lang}.{component}')
	os.system(f'combine_tessdata {lang}.')

	# Engine selection (--oem), OCR_ENGINE_MODE:
	#   0 = 'Legacy'
	#   1 = 'LSTM'

	# Page segmentation (--psm), PAGE_SEG_MODE:
	#   0   Orientation and script detection (OSD) only.
	#   1   Automatic page segmentation with OSD.
	#   2   Automatic page segmentation, but no OSD, or OCR. (not implemented)
	#   3   Fully automatic page segmentation, but no OSD. (Default)
	#   4   Assume a single column of text of variable sizes.
	#   5   Assume a single uniform block of vertically aligned text.
	#   6   Assume a single uniform block of text.
	#   7   Treat the image as a single text line.
	#   8   Treat the image as a single word.
	#   9   Treat the image as a single word in a circle.
	#   10  Treat the image as a single character.
	#   11  Sparse text. Find as much text as possible in no particular order.
	#   12  Sparse text with OSD.
	#   13  Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

	# Output format (appended after the language): '' | 'hocr' | 'tsv'
	# Recognise the image named by `train` (the last one prepared above) with the
	# LSTM engine and fully automatic page segmentation.
	ocr_command = f'tesseract {train}.jpg result -l {lang} --oem 1 --psm 3'
	os.system(ocr_command)