Skip to content

Instantly share code, notes, and snippets.

@billju
Last active February 4, 2022 00:19
Show Gist options
  • Save billju/479510ea0d3f1301f158f5d5adb2b0ce to your computer and use it in GitHub Desktop.
Save billju/479510ea0d3f1301f158f5d5adb2b0ce to your computer and use it in GitHub Desktop.
const puppeteer = require('puppeteer-core');
const fs = require('fs');
const path = require('path');
const glob = require('glob').sync;
const { execSync } = require('child_process');
// Hard-coded Windows path of the Microsoft Edge executable; passed to puppeteer.launch() as `executablePath`.
const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe';
/**
 * Crawler for the national electronic bulletin board (全國電子公佈欄).
 * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
 *
 * Runs a search starting from the date 109/01/01 (the values offered by the
 * site's own date selectors), then walks the result list page by page. For
 * every record it creates `odbbs/<document number>` next to this script and
 * clicks each attachment's download link so the files land in that folder.
 * Records whose folder already exists are treated as done and skipped.
 *
 * @returns {Promise<void>} Resolves once the record whose index equals the
 *   reported total has been handled (or a result page has no rows).
 */
async function fetch_odbbs() {
  const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
  try {
    const page = await browser.newPage();
    // Public CDP session instead of the private `page._client` handle, whose
    // shape differs between puppeteer releases.
    const cdp = await page.target().createCDPSession();
    await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
    await page.waitForSelector('[name=fryear]');
    // Set the start date of the query
    await page.select('[name=fryear]', '109');
    await page.select('[name=frmonth]', '01');
    await page.select('[name=frdate]', '01');
    await page.click('input[value="查 詢 "]');
    await page.waitForTimeout(3000);
    const total = (await page.$eval('#total2', (el) => el.textContent)).trim();

    let reachedLast = false;
    while (!reachedLast) {
      const rows = await page.$$('#tb_list tr:not(:first-child)');
      await page.waitForTimeout(500);
      // An empty result page means there is nothing left to walk through.
      if (rows.length === 0) break;

      for (const tr of rows) {
        // Read the running index up front so the last-record check also
        // applies to rows that are skipped because they were already fetched.
        const index = (await tr.$eval('td:first-child', (el) => el.textContent)).trim();
        const docNo = await tr.$eval('td:nth-child(3)', (el) => el.textContent);
        // One folder per document number; downloads are redirected into it
        const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
        console.log(downloadPath);

        if (!fs.existsSync(downloadPath)) {
          fs.mkdirSync(downloadPath, { recursive: true });
          await cdp.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
          // Open the attachment view; tr.click('td:last-child img') does not work here
          const attachmentIcon = await tr.$('td:last-child img');
          await attachmentIcon.click();
          await page.waitForSelector('#att_download');
          await page.waitForTimeout(1000);
          // Click the download link of every attachment; the fixed pause is
          // the only download-completion heuristic this script has.
          const attachmentRows = await page.$$('#att_download tr:not(:first-child)');
          for (const attachmentRow of attachmentRows) {
            const link = await attachmentRow.$('td:last-child a');
            await link.click();
            await page.waitForTimeout(2000);
          }
          await page.click('#box_05 input[value="返 回 列 表 "]');
          await page.waitForTimeout(500);
        }

        // Stop after the final record; this flag ends the outer paging loop
        // too (a bare `break` would only leave the row loop).
        if (index === total) {
          reachedLast = true;
          break;
        }
      }

      if (!reachedLast) {
        await page.click('img[title=下一頁]');
        await page.waitForTimeout(500);
      }
    }
  } finally {
    // Always release the browser, including when a selector lookup fails.
    await browser.close();
  }
}
/**
 * Convert the downloaded PDFs into an OCR training set.
 * https://mozilla.github.io/pdf.js/web/viewer.html
 *
 * Each PDF under `odbbs/` is opened in the hosted pdf.js viewer. For every
 * page, the text layer is read character by character to produce a box file
 * (`<char> <left> <bottom> <right> <top> 0` per line, with the vertical axis
 * measured from the bottom of the page) plus a screenshot of the same page.
 * Output goes to `odbbs/box/<pdf>P<n>.box` and `odbbs/jpg/<pdf>P<n>.jpg`.
 *
 * @returns {Promise<void>} Resolves when every PDF has been processed.
 */
async function pdf2image() {
  const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
  try {
    const page = await browser.newPage();
    // Size the viewport to an A4 sheet (210 x 297 mm at 3.78 px per mm) plus some margin
    await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
    await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
    await page.waitForSelector('input[type=file]');
    const fileInput = await page.$('input[type=file]');
    // path.join() yields backslashes on Windows, which glob patterns may treat
    // as escape characters, so normalise to forward slashes.
    const pdfPattern = path.join(__dirname, 'odbbs/**/*.pdf').replace(/\\/g, '/');

    for (const pdfFile of glob(pdfPattern)) {
      // Upload the PDF and wait for the viewer to load it
      await fileInput.uploadFile(pdfFile);
      await page.waitForTimeout(2000);
      await page.select('#scaleSelect', 'page-actual');
      await page.waitForTimeout(500);
      const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));

      for (let curPage = 1; curPage <= totalPage; curPage++) {
        // Collect a bounding box for every character on the page (runs in the browser)
        const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
          const { x, y, width, height } = el.getBoundingClientRect();
          const range = new Range();
          let box = '';
          for (const span of el.querySelectorAll('span[role=presentation]')) {
            // Exclude the binding margin and the page number
            if (parseFloat(span.style.left) < 70) continue;
            if (parseFloat(span.style.top) > 800) continue;
            const textNode = span.firstChild;
            // Empty spans or spans wrapping an element have no text to measure
            if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
            const text = textNode.textContent;
            // Step by code point so characters outside the BMP stay in one piece
            for (let i = 0; i < text.length; ) {
              const char = String.fromCodePoint(text.codePointAt(i));
              const start = i;
              i += char.length;
              if (char === ' ') continue;
              range.setStart(textNode, start);
              range.setEnd(textNode, start + char.length);
              const { top, left, right, bottom } = range.getBoundingClientRect();
              // Coordinates relative to the text layer, vertical axis flipped
              const rect = [left - x, height - bottom + y, right - x, height - top + y]
                .map((value) => Math.trunc(value))
                .join(' ');
              box += `${char} ${rect} 0\n`;
            }
          }
          return { box, clip: { x, y, width, height } };
        });

        // Tag the output with the page number
        const name = path.basename(pdfFile, '.pdf') + 'P' + curPage;
        const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
        const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
        // Pages with too little text are not kept as training data
        if (box.length >= 100) {
          fs.writeFileSync(boxFile, box);
          await page.screenshot({ path: jpgFile, clip });
        }
        // Always advance the viewer, even for skipped pages, so the visible
        // page stays in step with curPage (the screenshot clip is in viewport
        // coordinates).
        await page.click('button#next');
        await page.waitForTimeout(1000);
      }
    }
  } finally {
    await browser.close();
  }
}
/**
 * Train the OCR engine (Tesseract legacy training pipeline).
 * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
 * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
 * Weights download (version-sensitive; selecting "language data > chi_tra" in the installer is recommended): https://github.com/tesseract-ocr/tessdata_best
 * Environment variable: C:\Program Files (x86)\Tesseract-OCR\
 *
 * Changes the process working directory to `odbbs/tessdata`, builds the
 * training files there from `odbbs/jpg` and `odbbs/box`, runs the Tesseract
 * training tools through the shell and copies the resulting
 * `chi_tra_test.traineddata` into the Tesseract installation.
 *
 * @throws {Error} When no training data is available, or when a tool or file
 *   operation fails.
 */
function trainTesseract() {
  process.chdir(path.join(__dirname, 'odbbs/tessdata'));
  console.log(process.cwd());
  const lang = 'chi_tra';
  const font = 'kaiu';

  // Copy each image (.jpg) with its box file (.box) and generate training data (.tr).
  // The boxes come from the pdf2image output; no `makebox` pass is run.
  const jpgs = glob('../jpg/*.jpg');
  for (const [i, jpg] of jpgs.entries()) {
    const train = `${lang}.${font}.exp${i}`;
    if (fs.existsSync(`${train}.tr`)) continue;
    // Pair by file name rather than by array position, so a missing box file
    // cannot shift every following image onto the wrong boxes.
    const boxSource = path.join('../box', `${path.basename(jpg, '.jpg')}.box`);
    if (!fs.existsSync(boxSource)) {
      console.warn(`Skipping ${jpg}: no matching box file ${boxSource}`);
      continue;
    }
    fs.copyFileSync(jpg, `${train}.jpg`);
    fs.copyFileSync(boxSource, `${train}.box`);
    execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
  }

  const boxes = glob('*.box').join(' ');
  const trs = glob('*.tr').join(' ');
  if (boxes === '' || trs === '') {
    throw new Error('No training data found in odbbs/tessdata; run the "image" step first.');
  }

  // Font properties: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
  // The name must be the font part of the <lang>.<font>.exp<N> file names.
  fs.writeFileSync('font_properties', `${font} 0 0 0 1 0\n`);
  execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
  execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
  execSync(`cntraining -D . ${trs}`);
  // Give the training outputs the language prefix before combining them
  for (const artifact of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
    fs.renameSync(artifact, `${lang}.${artifact}`);
  }
  // Merge the files into the final weights
  execSync(`combine_tessdata ${lang}.`);
  // Rename to avoid clashing with the stock model, then install it
  // (note: writing under Program Files may be blocked by permissions).
  const trainedName = `${lang}_test.traineddata`;
  fs.renameSync(`${lang}.traineddata`, trainedName);
  // copyFileSync needs a destination file path, not just the target directory
  fs.copyFileSync(trainedName, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata', trainedName));
}
/**
 * Try out the trained model: runs tesseract with the `chi_tra_test` language
 * on the first .jpg found in `odbbs/tessdata` and writes the recognised text
 * to `odbbs/tessdata/result.txt`.
 *
 * @throws {Error} When `odbbs/tessdata` contains no .jpg, or when tesseract fails.
 */
function testTesseract() {
  /**
   * Engine (OCR_ENGINE_MODE, --oem)
   *   0 = 'Legacy'
   *   1 = 'LSTM'
   *
   * Mode (PAGE_SEG_MODE, --psm)
   *   0  Orientation and script detection (OSD) only.
   *   1  Automatic page segmentation with OSD.
   *   2  Automatic page segmentation, but no OSD, or OCR. (not implemented)
   *   3  Fully automatic page segmentation, but no OSD. (Default)
   *   4  Assume a single column of text of variable sizes.
   *   5  Assume a single uniform block of vertically aligned text.
   *   6  Assume a single uniform block of text.
   *   7  Treat the image as a single text line.
   *   8  Treat the image as a single word.
   *   9  Treat the image as a single word in a circle.
   *   10 Treat the image as a single character.
   *   11 Sparse text. Find as much text as possible in no particular order.
   *   12 Sparse text with OSD.
   *   13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
   *
   * Output format (appended after the language): '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
   */
  process.chdir(path.join(__dirname, 'odbbs/tessdata'));
  const lang = 'chi_tra_test';
  const [testImage] = glob('*.jpg');
  // Without this guard the literal string "undefined" would be handed to tesseract
  if (testImage === undefined) {
    throw new Error('No .jpg found in odbbs/tessdata; run the "train" step first.');
  }
  const textFile = 'result'; // tesseract appends .txt itself
  execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
  console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
}
/**
 * Create the working directories if they are missing.
 */
for (const dir of ['jpg', 'box', 'tessdata']) {
  // A recursive mkdir succeeds silently when the directory already exists
  fs.mkdirSync(path.join(__dirname, 'odbbs', dir), { recursive: true });
}
/**
 * Command-line entry point: node scripts/odbbs-crawler [fetch|image|train|test]
 * Any other (or missing) argument only creates the directories above.
 */
(async () => {
  const commands = new Map([
    ['fetch', fetch_odbbs],
    ['image', pdf2image],
    ['train', trainTesseract],
    ['test', testTesseract],
  ]);
  const run = commands.get(process.argv[2]);
  if (run === undefined) return;
  try {
    // `await` covers both the async crawlers and the synchronous steps
    await run();
  } catch (err) {
    console.error(err);
    // Exit explicitly: a browser that is still open would otherwise keep the
    // process alive after the failure.
    process.exit(1);
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment