Last active
February 4, 2022 00:19
-
-
Save billju/479510ea0d3f1301f158f5d5adb2b0ce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer-core'); | |
const fs = require('fs'); | |
const path = require('path'); | |
const glob = require('glob').sync; | |
const { execSync } = require('child_process'); | |
const edgeExe = 'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe'; | |
/**
 * Crawler for the national electronic bulletin board.
 * https://www.odbbs.gov.tw/odbbs/html/announce.jsp
 *
 * Opens the announcement search page in Edge, sets the query start date,
 * then walks the result list page by page. For every listed document it
 * creates `odbbs/<document number>` next to this script and clicks each
 * attachment download link so the files land in that folder. Documents whose
 * folder already exists are skipped. The crawl stops once the row whose index
 * equals the total shown in `#total2` has been handled.
 *
 * @returns {Promise<void>} Resolves when the last entry has been processed.
 */
async function fetch_odbbs() {
  const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
  try {
    const page = await browser.newPage();
    // Use a public CDP session instead of the private `page._client`, whose
    // shape differs between Puppeteer versions.
    const cdp = await page.target().createCDPSession();
    await page.goto('https://www.odbbs.gov.tw/odbbs/html/announce.jsp');
    await page.waitForSelector('[name=fryear]');
    // Set the query start date.
    await page.select('[name=fryear]', '109');
    await page.select('[name=frmonth]', '01');
    await page.select('[name=frdate]', '01');
    await page.click('input[value="查 詢 "]');
    await page.waitForTimeout(3000);
    const total = (await page.$eval('#total2', (el) => el.textContent)).trim();
    let reachedLast = false;
    while (!reachedLast) {
      const rows = await page.$$('#tb_list tr:not(:first-child)');
      await page.waitForTimeout(500);
      // An empty result page would otherwise make the loop click "next" forever.
      if (rows.length === 0) break;
      for (const row of rows) {
        const docNo = await row.$eval('td:nth-child(3)', (el) => el.textContent);
        // Read the row index up front so the last-entry check also applies
        // to rows that are skipped because they were downloaded earlier.
        const index = (await row.$eval('td:first-child', (el) => el.textContent)).trim();
        reachedLast = index === total;
        // Create the target folder and redirect downloads into it.
        const downloadPath = path.join(__dirname, 'odbbs/' + docNo);
        console.log(downloadPath);
        if (!fs.existsSync(downloadPath)) {
          fs.mkdirSync(downloadPath, { recursive: true });
          await cdp.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath });
          // Open the attachment view. Note from the original author:
          // tr.click('td:last-child img') does not work here, so the element
          // handle is resolved first and clicked directly.
          const attachmentIcon = await row.$('td:last-child img');
          if (attachmentIcon) {
            await attachmentIcon.click();
            await page.waitForSelector('#att_download');
            await page.waitForTimeout(1000);
            // Click the download link of every attached file.
            const attachmentRows = await page.$$('#att_download tr:not(:first-child)');
            for (const attachmentRow of attachmentRows) {
              const link = await attachmentRow.$('td:last-child a');
              if (!link) continue;
              await link.click();
              await page.waitForTimeout(2000);
            }
            await page.click('#box_05 input[value="返 回 列 表 "]');
            await page.waitForTimeout(500);
          } else {
            console.warn('No attachment button found for ' + docNo);
          }
        }
        // Stop after the final entry of the whole result set.
        if (reachedLast) break;
      }
      if (reachedLast) break;
      await page.click('img[title=下一頁]');
      await page.waitForTimeout(500);
    }
    // Fixed grace period so the most recent downloads can finish before the
    // browser is closed; completion itself is not detected.
    await page.waitForTimeout(5000);
  } finally {
    await browser.close();
  }
}
/**
 * Convert the downloaded PDFs into an OCR training data set.
 * https://mozilla.github.io/pdf.js/web/viewer.html
 *
 * Loads every `odbbs/**\/*.pdf` into the hosted pdf.js viewer and, for each
 * page, writes a box file (`odbbs/box/<name>P<page>.box`, one line per
 * character: `char left bottom right top 0`, with the vertical axis measured
 * from the bottom of the page clip) plus a screenshot of the page
 * (`odbbs/jpg/<name>P<page>.jpg`). Pages whose box text is shorter than 100
 * characters are skipped.
 *
 * @returns {Promise<void>} Resolves when all PDFs have been processed.
 */
async function pdf2image() {
  const browser = await puppeteer.launch({ executablePath: edgeExe, defaultViewport: null, headless: false });
  try {
    const page = await browser.newPage();
    // Size the viewport to roughly one A4 page (210mm x 297mm at 3.78 px/mm) plus margins.
    await page.setViewport({ width: Math.trunc(210 * 3.78) + 20, height: Math.trunc(297 * 3.78) + 40 });
    await page.goto('https://mozilla.github.io/pdf.js/web/viewer.html');
    await page.waitForSelector('input[type=file]');
    const fileInput = await page.$('input[type=file]');
    for (const pdfFile of glob(path.join(__dirname, 'odbbs/**/*.pdf'))) {
      // Upload the PDF; awaited so a failed upload surfaces here instead of
      // as an unhandled rejection.
      await fileInput.uploadFile(pdfFile);
      await page.waitForTimeout(2000);
      await page.select('#scaleSelect', 'page-actual');
      await page.waitForTimeout(500);
      const totalPage = Number(await page.$eval('#pageNumber', (el) => el.max));
      // Strip only the trailing extension, not the first ".pdf" in the name.
      const baseName = path.basename(pdfFile, '.pdf');
      for (let curPage = 1; curPage <= totalPage; curPage++) {
        // Collect one bounding box per character of the page's text layer.
        const { box, clip } = await page.$eval(`#viewer .page:nth-child(${curPage}) .textLayer`, (el) => {
          const { x, y, width, height } = el.getBoundingClientRect();
          const range = new Range();
          let box = '';
          for (const span of el.querySelectorAll('span[role=presentation]')) {
            // Skip the binding margin and the page number.
            if (parseFloat(span.style.left) < 70) continue;
            if (parseFloat(span.style.top) > 800) continue;
            const textNode = span.firstChild;
            // Empty spans and spans wrapping another element have no text to measure.
            if (!textNode || textNode.nodeType !== Node.TEXT_NODE) continue;
            const text = textNode.textContent;
            let offset = 0;
            while (offset < text.length) {
              // Step by code point so characters outside the BMP are not
              // split into two surrogate halves.
              const char = String.fromCodePoint(text.codePointAt(offset));
              const start = offset;
              offset += char.length;
              if (char === ' ') continue;
              range.setStart(textNode, start);
              range.setEnd(textNode, start + char.length);
              const { top, left, right, bottom } = range.getBoundingClientRect();
              // Coordinates relative to the text layer, y flipped to count from the bottom.
              const rect = [left - x, height - bottom + y, right - x, height - top + y]
                .map((value) => Math.trunc(value))
                .join(' ');
              box += char + ' ' + rect + ' 0\n';
            }
          }
          return { box, clip: { x, y, width, height } };
        });
        // Skip pages with too little text to be useful as training data.
        if (box.length >= 100) {
          // Tag the output files with the page number.
          const name = baseName + 'P' + curPage;
          const boxFile = path.join(__dirname, 'odbbs/box', name + '.box');
          const jpgFile = path.join(__dirname, 'odbbs/jpg', name + '.jpg');
          fs.writeFileSync(boxFile, box);
          await page.screenshot({ path: jpgFile, clip });
        }
        // Always advance the viewer, even for skipped pages, so the page in
        // view stays in step with curPage.
        if (curPage < totalPage) {
          await page.click('button#next');
          await page.waitForTimeout(1000);
        }
      }
    }
  } finally {
    await browser.close();
  }
}
/**
 * Train the OCR engine.
 * Official guide: https://tesseract-ocr.github.io/tessdoc/tess3/Training-Tesseract-3.03%E2%80%933.05.html
 * Engine download (version 4 recommended): https://github.com/UB-Mannheim/tesseract/wiki
 * Trained data download (version sensitive; selecting language data > chi_tra
 * in the installer is recommended): https://github.com/tesseract-ocr/tessdata_best
 * PATH entry: C:\Program Files (x86)\Tesseract-OCR\
 *
 * Changes the working directory to `odbbs/tessdata`, copies each
 * `../jpg/*.jpg` together with its same-named `../box/*.box` under the
 * `<lang>.<font>.exp<N>` naming scheme, runs the Tesseract training tools and
 * finally copies the resulting `<lang>_test.traineddata` into the Tesseract
 * installation folder.
 *
 * @throws {Error} When no training (.tr) files are available, or when any of
 *   the invoked command-line tools or file operations fails.
 */
function trainTesseract() {
  process.chdir(path.join(__dirname, 'odbbs/tessdata'));
  console.log(process.cwd());
  const lang = 'chi_tra';
  const font = 'kaiu';
  // Copy the images (.jpg) and boxes (.box), then generate training data (.tr).
  const jpgs = glob('../jpg/*.jpg');
  for (let i = 0; i < jpgs.length; i++) {
    const train = `${lang}.${font}.exp${i}`;
    if (fs.existsSync(train + '.tr')) continue;
    // Pair each image with the box file of the same base name rather than
    // relying on two independent glob results lining up by index.
    const boxSource = path.join('../box', path.basename(jpgs[i], '.jpg') + '.box');
    if (!fs.existsSync(boxSource)) {
      console.warn('Missing box file for ' + jpgs[i]);
      continue;
    }
    fs.copyFileSync(jpgs[i], train + '.jpg');
    fs.copyFileSync(boxSource, train + '.box');
    execSync(`tesseract -l ${lang} ${train}.jpg ${train} box.train`);
  }
  const boxes = glob('*.box').join(' ');
  const trs = glob('*.tr').join(' ');
  if (trs.length === 0) {
    throw new Error('No .tr training files found in ' + process.cwd());
  }
  // Write the font properties: <fontname> <italic> <bold> <fixed> <serif> <fraktur>.
  // The font name has to match the <font> part of the .tr file names.
  fs.writeFileSync('font_properties', `${font} 0 0 0 1 0\n`);
  execSync(`unicharset_extractor --output_unicharset unicharset ${boxes}`);
  execSync(`mftraining -F font_properties -U unicharset -O ${lang}.unicharset -D . ${trs}`);
  execSync(`cntraining -D . ${trs}`);
  for (const artifact of ['inttemp', 'normproto', 'pffmtable', 'shapetable']) {
    fs.renameSync(artifact, `${lang}.${artifact}`);
  }
  // Combine the files into the trained data.
  execSync(`combine_tessdata ${lang}.`);
  // Rename to avoid clashing with the stock language data, then copy it into
  // the Tesseract folder (note: writing under Program Files may be blocked
  // by permissions).
  const trainedData = `${lang}_test.traineddata`;
  fs.renameSync(`${lang}.traineddata`, trainedData);
  // copyFileSync needs a destination file name, not a directory.
  fs.copyFileSync(trainedData, path.join('C:/Program Files (x86)/Tesseract-OCR/tessdata/', trainedData));
}
/**
 * Test the recognition result.
 *
 * Runs Tesseract with the `chi_tra_test` language data on the first `*.jpg`
 * found in `odbbs/tessdata` and writes the recognised text to `result.txt`
 * in the same folder.
 *
 * @throws {Error} When `odbbs/tessdata` contains no jpg image, or when the
 *   tesseract command fails.
 */
function testTesseract() {
  /**
   * Engine OCR_ENGINE_MODE (--oem)
   * 0 = 'Legacy'
   * 1 = 'LSTM'
   *
   * Mode PAGE_SEG_MODE (--psm)
   * 0 Orientation and script detection (OSD) only.
   * 1 Automatic page segmentation with OSD.
   * 2 Automatic page segmentation, but no OSD, or OCR. (not implemented)
   * 3 Fully automatic page segmentation, but no OSD. (Default)
   * 4 Assume a single column of text of variable sizes.
   * 5 Assume a single uniform block of vertically aligned text.
   * 6 Assume a single uniform block of text.
   * 7 Treat the image as a single text line.
   * 8 Treat the image as a single word.
   * 9 Treat the image as a single word in a circle.
   * 10 Treat the image as a single character.
   * 11 Sparse text. Find as much text as possible in no particular order.
   * 12 Sparse text with OSD.
   * 13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
   *
   * Output format (appended after the language) '' | 'hocr' | 'tsv' | 'batch.nochop makebox'
   */
  process.chdir(path.join(__dirname, 'odbbs/tessdata'));
  const lang = 'chi_tra_test';
  const [testImage] = glob('*.jpg');
  // Fail with a clear message instead of running `tesseract undefined ...`.
  if (testImage === undefined) {
    throw new Error('No .jpg test image found in ' + process.cwd());
  }
  const textFile = 'result'; // tesseract appends .txt automatically
  execSync(`tesseract ${testImage} ${textFile} -l ${lang} --oem 1 --psm 3`);
  console.log('odbbs/tessdata/' + testImage, 'odbbs/tessdata/' + textFile + '.txt');
}
/**
 * Create the working folders if they do not exist yet.
 */
for (const dir of ['jpg', 'box', 'tessdata']) {
  // A recursive mkdir does nothing when the folder already exists.
  fs.mkdirSync(path.join(__dirname, 'odbbs', dir), { recursive: true });
}
/**
 * Run a step from the command line (node scripts/odbbs-crawler [argument]).
 * Recognised arguments: fetch | image | train | test. Any other value does nothing.
 */
const commands = new Map([
  ['fetch', fetch_odbbs],
  ['image', pdf2image],
  ['train', trainTesseract],
  ['test', testTesseract],
]);
const command = commands.get(process.argv[2]);
if (command) {
  // Route both synchronous throws and async rejections to one handler so a
  // failed step is reported and reflected in the exit code.
  Promise.resolve()
    .then(() => command())
    .catch((err) => {
      console.error(err);
      process.exitCode = 1;
    });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment