billju · February 4, 2022 00:19
diff --git a/tesseract_ocr訓練.js b/tesseract_ocr訓練.js
 const puppeteer = require('puppeteer-core');
 const fs = require('fs');
 const path = require('path');
 const glob = require('glob').sync;
 const { execSync } = require('child_process');
 const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe';
 /**
  * Crawler for the national electronic bulletin board (ODBBS).
  * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
  *
  * Launches Edge through puppeteer, submits the query form with the start
  * date selects set to 109 / 01 / 01, then walks the result table page by
  * page. For every row whose folder `odbbs/<docNo>` does not exist yet, the
  * folder is created, registered as the download directory, and every
  * attachment link of that row is clicked.
  *
  * The crawl ends once the row whose first cell equals the `#total2` counter
  * has been handled (even if that row was already downloaded), or when a
  * result page contains no rows. The browser is always closed afterwards.
  *
  * @returns {Promise<void>} Resolves after the browser has been closed.
  */
 async function fetch_odbbs() {
 	const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
 	try {
 		const page = await browser.newPage();
 		await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
 		await page.waitForSelector('[name=fryear]');
 		// Set the start date of the query
 		await page.select('[name=fryear]', '109');
 		await page.select('[name=frmonth]', '01');
 		await page.select('[name=frdate]', '01');
 		await page.click('input[value="查 詢 "]');
 		await page.waitForTimeout(3000);
 		const total = (await page.$eval('#total2', (el) => el.textContent)).trim();
 		let finished = false;
 		while (!finished) {
 			// Collect the data rows of the current result page (header row excluded)
 			const tb_list_trs = await page.$$('#tb_list tr:not(:first-child)');
 			await page.waitForTimeout(500);
 			// Nothing listed: stop instead of clicking "next page" forever
 			if (tb_list_trs.length === 0) break;
 			for (const tr of tb_list_trs) {
 				const docNo = await tr.$eval('td:nth-child(3)', (el) => el.textContent);
 				// Read the row number up front so that already-downloaded rows
 				// still take part in the "last row" check below
 				const index = (await tr.$eval('td:first-child', (el) => el.textContent)).trim();
 				// Create the folder and redirect downloads into it
 				const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
 				console.log(downloadPath);
 				if (!fs.existsSync(downloadPath)) {
 					fs.mkdirSync(downloadPath, { recursive: true });
 					await page._client.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
 					// Open the attachment view; tr.click('td:last-child img') does not work here
 					const attachmentIcon = await tr.$('td:last-child img');
 					await attachmentIcon.click();
 					await page.waitForSelector('#att_download');
 					await page.waitForTimeout(1000);
 					// Click the download link of every attachment
 					const att_trs = await page.$$('#att_download tr:not(:first-child)');
 					for (const t of att_trs) {
 						const link = await t.$('td:last-child a');
 						await link.click();
 						// The fixed pause is the only guard for download completion
 						await page.waitForTimeout(2000);
 					}
 					await page.click('#box_05 input[value="返 回 列 表 "]');
 					await page.waitForTimeout(500);
 				}
 				// Was this the last record of the whole result set?
 				if (index === total) {
 					finished = true;
 					break;
 				}
 			}
 			if (!finished) {
 				await page.click('img[title=下一頁]');
 				await page.waitForTimeout(500);
 			}
 		}
 	} finally {
 		await browser.close();
 	}
 }
 /**
  * Turn the downloaded PDFs into an OCR training set.
  * https://mozilla.github.io/pdf.js/web/viewer.html
  *
  * Every PDF matched by `odbbs/**` + `/*.pdf` is loaded into the hosted pdf.js
  * viewer. For each page the text layer is walked character by character and
  * one line per character is written in the form
  * `<char> <left> <bottom> <right> <top> 0`, with coordinates measured from the
  * bottom-left corner of the page's text layer. Pages yielding fewer than 100
  * characters of box text are skipped; for the others `odbbs/box/<name>P<n>.box`
  * and a screenshot `odbbs/jpg/<name>P<n>.jpg` are written.
  *
  * @returns {Promise<void>} Resolves after the browser has been closed.
  */
 async function pdf2image() {
 	const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
 	try {
 		const page = await browser.newPage();
 		// Size the viewport to about one A4 page (210 x 297 mm scaled by 3.78) plus margins
 		await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
 		await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
 		await page.waitForSelector('input[type=file]');
 		const fileInput = await page.$('input[type=file]');
 		for (const pdfFile of glob(path.join(__dirname, 'odbbs/**/*.pdf'))) {
 			// Upload the PDF; awaited so a failed upload is not a floating rejection
 			await fileInput.uploadFile(pdfFile);
 			await page.waitForTimeout(2000);
 			await page.select('#scaleSelect', 'page-actual');
 			await page.waitForTimeout(500);
 			// The `max` property is converted once so the loop compares numbers
 			const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));
 			const baseName = path.basename(pdfFile, '.pdf');
 			for (let curPage = 1; curPage <= totalPage; curPage++) {
 				// Extract one bounding box per character of the page's text layer
 				const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
 					const { x, y, width, height } = el.getBoundingClientRect();
 					const range = new Range();
 					let box = '';
 					for (const span of el.querySelectorAll('span[role=presentation]')) {
 						// Leave out the binding margin and the page number
 						if (parseFloat(span.style.left) < 70) continue;
 						if (parseFloat(span.style.top) > 800) continue;
 						const textNode = span.firstChild;
 						// Empty spans or spans starting with an element have no text to measure
 						if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
 						for (let i = 0; i < textNode.length; i++) {
 							range.setStart(textNode, i);
 							range.setEnd(textNode, i + 1);
 							const { top, left, right, bottom } = range.getBoundingClientRect();
 							const char = textNode.textContent.slice(i, i + 1);
 							if (char === ' ') continue;
 							// left, bottom, right, top with the vertical axis flipped (origin at the bottom)
 							const rect = [left - x, height - bottom + y, right - x, height - top + y]
 								.map((value) => Math.trunc(value))
 								.join(' ');
 							box += char + ' ' + rect + ' 0\n';
 						}
 					}
 					return { box, clip: { x, y, width, height } };
 				});
 				// Tag the output with the page number
 				const name = baseName + 'P' + curPage;
 				const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
 				const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
 				// Skip pages with too little text to be useful for training
 				if (box.length >= 100) {
 					fs.writeFileSync(boxFile, box);
 					await page.screenshot({ path: jpgFile, clip });
 				}
 				// Always move the viewer on, so it stays in step with curPage
 				// even when the current page was skipped
 				await page.click('button#next');
 				await page.waitForTimeout(1000);
 			}
 		}
 	} finally {
 		await browser.close();
 	}
 }
 /**
  * Train the OCR engine from the files produced in `odbbs/jpg` and `odbbs/box`.
  * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
  * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
  * Weights download (version-sensitive; selecting language data > chi_tra in the installer is recommended): https://github.com/tesseract-ocr/tessdata_best
  * Environment variable: C:\Program Files (x86)\Tesseract-OCR\
  *
  * Changes the working directory to `odbbs/tessdata`, copies each image and
  * its same-named box file to `<lang>.<font>.exp<i>.*`, runs the Tesseract
  * training tools through `execSync`, and finally copies the resulting
  * `<lang>_test.traineddata` into the Tesseract installation folder.
  *
  * @returns {void}
  * @throws {Error} When one of the external commands or file operations fails.
  */
 function trainTesseract() {
 	process.chdir(path.join(__dirname, 'odbbs/tessdata'));
 	console.log(process.cwd());
 	const lang = 'chi_tra';
 	const font = 'kaiu';
 	// Copy each image (.jpg) with its box file (.box) and generate training data (.tr)
 	const jpgs = glob('../jpg/*.jpg');
 	for (let i = 0; i < jpgs.length; i++) {
 		const train = `${lang}.${font}.exp${i}`;
 		if (fs.existsSync(train + '.tr')) continue;
 		// Pair by file name instead of by array position so a missing box file
 		// cannot shift every following image onto the wrong boxes
 		const boxSource = path.join('../box', path.basename(jpgs[i], '.jpg') + '.box');
 		if (!fs.existsSync(boxSource)) {
 			console.warn('missing box file, skipped: ' + boxSource);
 			continue;
 		}
 		fs.copyFileSync(jpgs[i], train + '.jpg');
 		fs.copyFileSync(boxSource, train + '.box');
 		// Alternative: `batch.nochop makebox` instead of `box.train` lets tesseract create the boxes itself
 		execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
 	}
 	const boxes = glob('*.box').join(' ');
 	const trs = glob('*.tr').join(' ');
 	// Write the font properties file
 	fs.writeFileSync('font_properties', `${lang} 0 0 0 1 0`);
 	execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
 	execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
 	execSync(`cntraining -D . ${trs}`);
 	// Prefix the generated tables with the language name for combine_tessdata
 	for (const table of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
 		fs.renameSync(table, `${lang}.${table}`);
 	}
 	// Merge the files into the final weights
 	execSync(`combine_tessdata ${lang}.`);
 	// Rename to avoid clashing with the installed data, then copy it into the
 	// install folder (note: writing to Program Files may be denied)
 	const trainedName = `${lang}_test.traineddata`;
 	fs.renameSync(`${lang}.traineddata`, trainedName);
 	// copyFileSync needs a destination file path, not a directory
 	fs.copyFileSync(trainedName, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata/', trainedName));
 }
 /**
  * Run the trained model on one image to check the recognition result.
  *
  * Uses the first `*.jpg` found in `odbbs/tessdata` and writes the text to
  * `result.txt` in the same folder.
  *
  * @returns {void}
  * @throws {Error} When no image is available or the tesseract command fails.
  */
 function testTesseract() {
 	/**
 	 * Engine: OCR_ENGINE_MODE
 	 * 0 = 'Legacy'
 	 * 1 = 'LSTM'
 	 *
 	 * Mode: PAGE_SEG_MODE
 	 * 0  Orientation and script detection (OSD) only.
 	 * 1  Automatic page segmentation with OSD.
 	 * 2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
 	 * 3  Fully automatic page segmentation, but no OSD. (Default)
 	 * 4  Assume a single column of text of variable sizes.
 	 * 5  Assume a single uniform block of vertically aligned text.
 	 * 6  Assume a single uniform block of text.
 	 * 7  Treat the image as a single text line.
 	 * 8  Treat the image as a single word.
 	 * 9  Treat the image as a single word in a circle.
 	 * 10 Treat the image as a single character.
 	 * 11 Sparse text. Find as much text as possible in no particular order.
 	 * 12 Sparse text with OSD.
 	 * 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
 	 *
 	 * Output format (appended after the language): '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
 	 */
 	process.chdir(path.join(__dirname, 'odbbs/tessdata'));
 	const lang = 'chi_tra_test';
 	const [testImage] = glob('*.jpg');
 	// Without this guard the command would be run on the literal name "undefined"
 	if (testImage === undefined) {
 		throw new Error('no .jpg file found in odbbs/tessdata');
 	}
 	const textFile = 'result'; // tesseract appends .txt itself
 	execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
 	console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
 }
 /**
  * Make sure the working folders exist.
  */
 ['jpg', 'box', 'tessdata'].forEach((folder) => {
 	const target = `${__dirname}/odbbs/${folder}`;
 	if (fs.existsSync(target)) return;
 	fs.mkdirSync(target, { recursive: true });
 });
 /**
  * Command-line entry point (node scripts/odbbs-crawler [argument]).
  * The first argument selects the step to run; anything else does nothing.
  */
 switch (process.argv[2]) {
 	case 'fetch':
 		fetch_odbbs();
 		break;
 	case 'image':
 		pdf2image();
 		break;
 	case 'train':
 		trainTesseract();
 		break;
 	case 'test':
 		testTesseract();
 		break;
 	default:
 		break;
 }
	const puppeteer = require('puppeteer-core');
	const fs = require('fs');
	const path = require('path');
	const glob = require('glob').sync;
	const { execSync } = require('child_process');
	const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe';
	/**
	 * Crawler for the national electronic bulletin board (ODBBS).
	 * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
	 *
	 * Launches Edge through puppeteer, submits the query form with the start
	 * date selects set to 109 / 01 / 01, then walks the result table page by
	 * page. For every row whose folder `odbbs/<docNo>` does not exist yet, the
	 * folder is created, registered as the download directory, and every
	 * attachment link of that row is clicked.
	 *
	 * The crawl ends once the row whose first cell equals the `#total2` counter
	 * has been handled (even if that row was already downloaded), or when a
	 * result page contains no rows. The browser is always closed afterwards.
	 *
	 * @returns {Promise<void>} Resolves after the browser has been closed.
	 */
	async function fetch_odbbs() {
		const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
		try {
			const page = await browser.newPage();
			await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
			await page.waitForSelector('[name=fryear]');
			// Set the start date of the query
			await page.select('[name=fryear]', '109');
			await page.select('[name=frmonth]', '01');
			await page.select('[name=frdate]', '01');
			// The button value contains spaces between the characters
			await page.click('input[value="查 詢 "]');
			await page.waitForTimeout(3000);
			const total = (await page.$eval('#total2', (el) => el.textContent)).trim();
			let finished = false;
			while (!finished) {
				// Collect the data rows of the current result page (header row excluded)
				const tb_list_trs = await page.$$('#tb_list tr:not(:first-child)');
				await page.waitForTimeout(500);
				// Nothing listed: stop instead of clicking "next page" forever
				if (tb_list_trs.length === 0) break;
				for (const tr of tb_list_trs) {
					const docNo = await tr.$eval('td:nth-child(3)', (el) => el.textContent);
					// Read the row number up front so that already-downloaded rows
					// still take part in the "last row" check below
					const index = (await tr.$eval('td:first-child', (el) => el.textContent)).trim();
					// Create the folder and redirect downloads into it
					const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
					console.log(downloadPath);
					if (!fs.existsSync(downloadPath)) {
						fs.mkdirSync(downloadPath, { recursive: true });
						await page._client.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
						// Open the attachment view; tr.click('td:last-child img') does not work here
						const attachmentIcon = await tr.$('td:last-child img');
						await attachmentIcon.click();
						await page.waitForSelector('#att_download');
						await page.waitForTimeout(1000);
						// Click the download link of every attachment
						const att_trs = await page.$$('#att_download tr:not(:first-child)');
						for (const t of att_trs) {
							const link = await t.$('td:last-child a');
							await link.click();
							// The fixed pause is the only guard for download completion
							await page.waitForTimeout(2000);
						}
						await page.click('#box_05 input[value="返 回 列 表 "]');
						await page.waitForTimeout(500);
					}
					// Was this the last record of the whole result set?
					if (index === total) {
						finished = true;
						break;
					}
				}
				if (!finished) {
					await page.click('img[title=下一頁]');
					await page.waitForTimeout(500);
				}
			}
		} finally {
			await browser.close();
		}
	}
	/**
	 * Turn the downloaded PDFs into an OCR training set.
	 * https://mozilla.github.io/pdf.js/web/viewer.html
	 *
	 * Every PDF matched by `odbbs/**` + `/*.pdf` is loaded into the hosted pdf.js
	 * viewer. For each page the text layer is walked character by character and
	 * one line per character is written in the form
	 * `<char> <left> <bottom> <right> <top> 0`, with coordinates measured from the
	 * bottom-left corner of the page's text layer. Pages yielding fewer than 100
	 * characters of box text are skipped; for the others `odbbs/box/<name>P<n>.box`
	 * and a screenshot `odbbs/jpg/<name>P<n>.jpg` are written.
	 *
	 * @returns {Promise<void>} Resolves after the browser has been closed.
	 */
	async function pdf2image() {
		const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
		try {
			const page = await browser.newPage();
			// Size the viewport to about one A4 page (210 x 297 mm scaled by 3.78) plus margins
			await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
			await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
			await page.waitForSelector('input[type=file]');
			const fileInput = await page.$('input[type=file]');
			// Globstar pattern: PDFs at any depth below odbbs/
			for (const pdfFile of glob(path.join(__dirname, 'odbbs/**/*.pdf'))) {
				// Upload the PDF; awaited so a failed upload is not a floating rejection
				await fileInput.uploadFile(pdfFile);
				await page.waitForTimeout(2000);
				await page.select('#scaleSelect', 'page-actual');
				await page.waitForTimeout(500);
				// The `max` property is converted once so the loop compares numbers
				const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));
				const baseName = path.basename(pdfFile, '.pdf');
				for (let curPage = 1; curPage <= totalPage; curPage++) {
					// Extract one bounding box per character of the page's text layer
					const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
						const { x, y, width, height } = el.getBoundingClientRect();
						const range = new Range();
						let box = '';
						for (const span of el.querySelectorAll('span[role=presentation]')) {
							// Leave out the binding margin and the page number
							if (parseFloat(span.style.left) < 70) continue;
							if (parseFloat(span.style.top) > 800) continue;
							const textNode = span.firstChild;
							// Empty spans or spans starting with an element have no text to measure
							if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
							for (let i = 0; i < textNode.length; i++) {
								range.setStart(textNode, i);
								range.setEnd(textNode, i + 1);
								const { top, left, right, bottom } = range.getBoundingClientRect();
								const char = textNode.textContent.slice(i, i + 1);
								if (char === ' ') continue;
								// left, bottom, right, top with the vertical axis flipped (origin at the bottom)
								const rect = [left - x, height - bottom + y, right - x, height - top + y]
									.map((value) => Math.trunc(value))
									.join(' ');
								box += char + ' ' + rect + ' 0\n';
							}
						}
						return { box, clip: { x, y, width, height } };
					});
					// Tag the output with the page number
					const name = baseName + 'P' + curPage;
					const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
					const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
					// Skip pages with too little text to be useful for training
					if (box.length >= 100) {
						fs.writeFileSync(boxFile, box);
						await page.screenshot({ path: jpgFile, clip });
					}
					// Always move the viewer on, so it stays in step with curPage
					// even when the current page was skipped
					await page.click('button#next');
					await page.waitForTimeout(1000);
				}
			}
		} finally {
			await browser.close();
		}
	}
	/**
	 * Train the OCR engine from the files produced in `odbbs/jpg` and `odbbs/box`.
	 * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
	 * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
	 * Weights download (version-sensitive; selecting language data > chi_tra in the installer is recommended): https://github.com/tesseract-ocr/tessdata_best
	 * Environment variable: C:\Program Files (x86)\Tesseract-OCR\
	 *
	 * Changes the working directory to `odbbs/tessdata`, copies each image and
	 * its same-named box file to `<lang>.<font>.exp<i>.*`, runs the Tesseract
	 * training tools through `execSync`, and finally copies the resulting
	 * `<lang>_test.traineddata` into the Tesseract installation folder.
	 *
	 * @returns {void}
	 * @throws {Error} When one of the external commands or file operations fails.
	 */
	function trainTesseract() {
		process.chdir(path.join(__dirname, 'odbbs/tessdata'));
		console.log(process.cwd());
		const lang = 'chi_tra';
		const font = 'kaiu';
		// Copy each image (.jpg) with its box file (.box) and generate training data (.tr)
		const jpgs = glob('../jpg/*.jpg');
		for (let i = 0; i < jpgs.length; i++) {
			const train = `${lang}.${font}.exp${i}`;
			if (fs.existsSync(train + '.tr')) continue;
			// Pair by file name instead of by array position so a missing box file
			// cannot shift every following image onto the wrong boxes
			const boxSource = path.join('../box', path.basename(jpgs[i], '.jpg') + '.box');
			if (!fs.existsSync(boxSource)) {
				console.warn('missing box file, skipped: ' + boxSource);
				continue;
			}
			fs.copyFileSync(jpgs[i], train + '.jpg');
			fs.copyFileSync(boxSource, train + '.box');
			// Alternative: `batch.nochop makebox` instead of `box.train` lets tesseract create the boxes itself
			execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
		}
		const boxes = glob('*.box').join(' ');
		const trs = glob('*.tr').join(' ');
		// Write the font properties file
		fs.writeFileSync('font_properties', `${lang} 0 0 0 1 0`);
		execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
		execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
		execSync(`cntraining -D . ${trs}`);
		// Prefix the generated tables with the language name for combine_tessdata
		for (const table of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
			fs.renameSync(table, `${lang}.${table}`);
		}
		// Merge the files into the final weights
		execSync(`combine_tessdata ${lang}.`);
		// Rename to avoid clashing with the installed data, then copy it into the
		// install folder (note: writing to Program Files may be denied)
		const trainedName = `${lang}_test.traineddata`;
		fs.renameSync(`${lang}.traineddata`, trainedName);
		// copyFileSync needs a destination file path, not a directory
		fs.copyFileSync(trainedName, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata/', trainedName));
	}
	/**
	 * Run the trained model on one image to check the recognition result.
	 *
	 * Uses the first `*.jpg` found in `odbbs/tessdata` and writes the text to
	 * `result.txt` in the same folder.
	 *
	 * @returns {void}
	 * @throws {Error} When no image is available or the tesseract command fails.
	 */
	function testTesseract() {
		/**
		 * Engine: OCR_ENGINE_MODE
		 * 0 = 'Legacy'
		 * 1 = 'LSTM'
		 *
		 * Mode: PAGE_SEG_MODE
		 * 0  Orientation and script detection (OSD) only.
		 * 1  Automatic page segmentation with OSD.
		 * 2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
		 * 3  Fully automatic page segmentation, but no OSD. (Default)
		 * 4  Assume a single column of text of variable sizes.
		 * 5  Assume a single uniform block of vertically aligned text.
		 * 6  Assume a single uniform block of text.
		 * 7  Treat the image as a single text line.
		 * 8  Treat the image as a single word.
		 * 9  Treat the image as a single word in a circle.
		 * 10 Treat the image as a single character.
		 * 11 Sparse text. Find as much text as possible in no particular order.
		 * 12 Sparse text with OSD.
		 * 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
		 *
		 * Output format (appended after the language): '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
		 */
		process.chdir(path.join(__dirname, 'odbbs/tessdata'));
		const lang = 'chi_tra_test';
		const [testImage] = glob('*.jpg');
		// Without this guard the command would be run on the literal name "undefined"
		if (testImage === undefined) {
			throw new Error('no .jpg file found in odbbs/tessdata');
		}
		const textFile = 'result'; // tesseract appends .txt itself
		execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
		console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
	}
	/**
	 * Make sure the working folders exist.
	 */
	['jpg', 'box', 'tessdata'].forEach((folder) => {
		const target = `${__dirname}/odbbs/${folder}`;
		if (fs.existsSync(target)) return;
		fs.mkdirSync(target, { recursive: true });
	});
	/**
	 * Command-line entry point (node scripts/odbbs-crawler [argument]).
	 * The first argument selects the step to run; anything else does nothing.
	 */
	switch (process.argv[2]) {
		case 'fetch':
			fetch_odbbs();
			break;
		case 'image':
			pdf2image();
			break;
		case 'train':
			trainTesseract();
			break;
		case 'test':
			testTesseract();
			break;
		default:
			break;
	}