Last active
February 24, 2016 03:39
-
-
Save vsemozhetbyt/bfa76deac0d374b6b276 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /******************************************************************************/ | |
| 'use strict'; | |
| const fs = require('fs'); | |
| const pth = require('path'); | |
| const readline = require('readline'); | |
// NW.js window handle and the DOM document of the application page.
const nwWin = nw.Window.get();
const nwDoc = window.document;

// UI elements looked up once at start-up.
const eTocFile = nwDoc.querySelector('#tocFile');     // input whose value is the TOC file path
const eOutputDir = nwDoc.querySelector('#outputDir'); // input whose value is the output directory
const eSave = nwDoc.querySelector('#save');           // button toggled between "Save" and "Stop"
const eInfo = nwDoc.querySelector('#info');           // status area written through textContent
const eAudio = nwDoc.querySelector('#audio');         // played as an audible notification
const eBrowser = nwDoc.querySelector('#browser');     // frame whose src is set to each URL

// Settings mirrored to config.json, and the paths / file descriptors in use.
const config = {};
const io = {};

// Matches the positions between groups of three trailing digits (thousands separators).
const formatNumberRE = /\B(?=(?:\d{3})+$)/g;

// One hour in milliseconds: period of the speed statistics timer.
const hour = 1000 * 60 * 60;

// Queue of URLs read from the TOC file that still have to be saved.
const toc = [];

// A loaded page is considered ready once any of these selectors matches.
const selectorsToCheck = [
  '#content-text #newsletter a[href]',
  '#content-text .index-words a[href]',
];

// Elements removed from the page before it is converted.
const selectorsToDelete = ['#content-text #social', '#content-text #newsletter'];

// Element whose text becomes the dictionary card body.
const selectorsToSave = ['#content-text'];

// Polling period (ms) used while waiting for a page to become ready.
const checkFreq = 100;

// Headwords collected for the page being processed; emptied after each card is written.
const headwordsBuffer = new Set();

// URL requested before the current one, and the URL being loaded now.
let prevURL = '';
let currURL = '';

// Queue length recorded at the previous speed measurement.
let restMark;

// Latest speed/ETA summary appended to the progress lines.
let speedInfo = '?/h (?/min): ~? hours left, ~? days left.';

// Run-state flags: a save is in progress / stop was requested / close the window when stopped.
let saving = false;
let stop = false;
let exit = false;
| /******************************************************************************/ | |
// Intercept window closing so that a running save can be stopped cleanly first.
nwWin.on('close', onExit);

// Restore the previously used locations as the initial directories of the
// file pickers. A missing config.json is normal on the first run; any other
// failure (unreadable file, malformed JSON) is reported instead of being
// silently swallowed, but never prevents the application from starting.
try {
  Object.assign(config, JSON.parse(fs.readFileSync('config.json', 'utf8')));
} catch (e) {
  if (e.code !== 'ENOENT') {
    console.warn(`config.json ignored: ${e}`);
  }
}
// Each hint is applied independently, so a missing tocFile entry no longer
// prevents the outputDir hint from being set.
if (typeof config.tocFile === 'string' && config.tocFile) {
  eTocFile.setAttribute('nwworkingdir', pth.dirname(config.tocFile));
}
if (typeof config.outputDir === 'string' && config.outputDir) {
  eOutputDir.setAttribute('nwworkingdir', config.outputDir);
}

eTocFile.addEventListener('change', checkDirs);
eOutputDir.addEventListener('change', checkDirs);
eSave.addEventListener('click', saveDic);

// Put the Save button and the hint text into their initial state.
checkDirs();
| /******************************************************************************/ | |
/**
 * Copies the current values of both path inputs into `config`, `io` and the
 * inputs' tooltips, then enables the Save button (and persists `config` to
 * config.json) only when both paths are set; otherwise shows a hint.
 */
function checkDirs() {
  const tocPath = eTocFile.value;
  const outPath = eOutputDir.value;

  eTocFile.title = tocPath;
  io.tocFile = tocPath;
  config.tocFile = tocPath;

  eOutputDir.title = outPath;
  io.outputDir = outPath;
  config.outputDir = outPath;

  if (!(tocPath && outPath)) {
    eInfo.textContent = 'Select the TOC file and the output directory please.';
    eSave.disabled = true;
    return;
  }

  eInfo.textContent = '';
  eSave.disabled = false;
  fs.writeFileSync('config.json', JSON.stringify(config), 'utf8');
}
| /******************************************************************************/ | |
/**
 * Click handler of the "Stop" button: raises the `stop` flag, which is
 * checked before the next URL is requested.
 */
function onStop() {
  stop = true;
}
| /******************************************************************************/ | |
/**
 * Handler of the window "close" event. When nothing is being saved the
 * window is force-closed at once. During a save the user is asked first;
 * on confirmation the `stop` and `exit` flags are raised so the window is
 * closed when the save has ended.
 */
function onExit() {
  if (!saving) {
    nwWin.close(true);
    return;
  }

  const confirmed = confirm('Do you want to exit? Dictionary saving will be suspended.');
  if (confirmed) {
    exit = true;
    stop = true;
  }
}
| /******************************************************************************/ | |
/**
 * Timer callback (scheduled once per hour): measures how many queue entries
 * were consumed since the previous measurement, refreshes the `speedInfo`
 * summary and plays the notification sound.
 *
 * When no progress was made in the period the remaining time cannot be
 * estimated; "?" is shown instead of dividing by zero (which produced
 * "~Infinity" or "~NaN" in the summary).
 */
function setSpeedInfo() {
  const donePerHour = restMark - toc.length;
  const donePerMin = Math.round(donePerHour / 60);
  restMark = toc.length;

  let hoursLeft = '?';
  let daysLeft = '?';
  if (donePerHour > 0) {
    hoursLeft = Math.round(restMark / donePerHour);
    daysLeft = Math.round(hoursLeft / 24);
  }

  speedInfo = `${donePerHour}/h (${donePerMin}/min): ~${hoursLeft} hours left, ~${daysLeft} days left.`;
  eAudio.play();
}
| /******************************************************************************/ | |
/**
 * Appends `str` plus a newline to the status area. Once the text holds
 * eleven consecutive newline-terminated lines, everything before the final
 * ten-line run is dropped and the area is scrolled to its bottom.
 *
 * @param {string} str - text of the line to append
 */
function updateInfo(str) {
  eInfo.textContent = `${eInfo.textContent}${str}\n`;

  const hasTooManyLines = /(?:.*\n){11}/.test(eInfo.textContent);
  if (!hasTooManyLines) {
    return;
  }

  const lastTenLines = eInfo.textContent.replace(/[^]+((?:^.*\n){10})/m, '$1');
  eInfo.textContent = lastTenLines;
  eInfo.scrollTop = eInfo.scrollHeight;
}
| /******************************************************************************/ | |
/**
 * Builds a JSON-serialisable summary of an event object.
 *
 * DOM events expose `type`, `message`, etc. through prototype accessors, so
 * `JSON.stringify(event)` alone yields only own properties (typically just
 * `isTrusted`). The fields useful for diagnosing a load error are therefore
 * copied explicitly when they are present.
 */
function describeErrorEvent(evt) {
  if (evt === null || typeof evt !== 'object') {
    return evt;
  }
  const details = Object.assign({}, evt);
  ['type', 'message', 'filename', 'lineno', 'colno'].forEach(key => {
    const value = evt[key];
    if (value !== undefined && value !== null && typeof value !== 'function') {
      details[key] = value;
    }
  });
  return details;
}

/**
 * "error" handler of the browsing frame: appends an entry (timestamp, URL,
 * event details) to the error log and plays the notification sound.
 * Nothing is written when the current URL equals the previous one, i.e.
 * when the same URL is being retried.
 *
 * @param {Event} evt - the error event
 */
function logError(evt) {
  if (currURL !== prevURL) {
    const entry = [
      `Iframe error (${new Date()}).`,
      `${currURL}`,
      `${JSON.stringify(describeErrorEvent(evt))}`,
      '',
    ].join('\n');
    fs.writeSync(io.errFile, entry, null, 'utf8');
  }
  eAudio.play();
}
| /******************************************************************************/ | |
/**
 * Prefixes each of the characters \ [ ] { } @ ^ ~ < > # ( ) with a
 * backslash.
 *
 * @param {string} str - raw text
 * @returns {string} text with those characters backslash-escaped
 */
function secureLow(str) {
  const escapeChar = ch => `\\${ch}`;
  return str.replace(/[\\\[\]{}@^~<>#()]/g, escapeChar);
}
| /******************************************************************************/ | |
/**
 * Normalises whitespace of text that is about to be written to the
 * dictionary file.
 *
 * Headword mode (`isHeadword` truthy): the text is trimmed, embedded line
 * breaks (with the blanks around them) become a single space so that the
 * headword always stays on one line, and runs of blanks are collapsed.
 *
 * Body mode: the text is trimmed, runs of blanks are collapsed, blanks at
 * line edges are removed, every line after the first is prefixed with a tab
 * and each run of empty lines is replaced with one line holding "\ ".
 *
 * @param {string} str - text to normalise
 * @param {boolean} [isHeadword] - true to produce a single-line headword
 * @returns {string} normalised text
 */
function secureHigh(str, isHeadword) {
  const trimmed = str.trim();

  if (isHeadword) {
    return trimmed
      .replace(/[ \t]*[\r\n]+[ \t]*/g, ' ')
      .replace(/[ \t]{2,}/g, ' ');
  }

  return trimmed
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/^[ \t]+|[ \t]+$/gm, '')
    .replace(/\n/g, '\n\t')
    .replace(/(?:\n\t){2,}/g, '\n\t\\ \n\t');
}
| /******************************************************************************/ | |
/**
 * Reads `input` line by line, passing every non-empty trimmed line to
 * `onLine`, and calls `onDone` when the input is exhausted.
 */
function readNonEmptyLines(input, onLine, onDone) {
  readline.createInterface({ input, terminal: false, historySize: 0 })
    .on('line', line => {
      const text = line.trim();
      if (text) onLine(text);
    })
    .on('close', onDone);
}

/**
 * Starts working through the URL queue: records the queue length for the
 * speed statistics, (re)starts the hourly statistics timer, shows the number
 * of remaining URLs and requests the first one. With an empty queue the save
 * is reported as complete instead of requesting an undefined URL.
 */
function startQueue() {
  if (!toc.length) {
    updateInfo( nwDoc.title = nwWin.title = 'Saving complete.' );
    endSaving();
    return;
  }

  restMark = toc.length;

  // Only one statistics timer may run; a timer left over from a previous
  // run would make the hourly figures wrong.
  if (io.speedTimer) global.clearInterval(io.speedTimer);
  io.speedTimer = global.setInterval(setSpeedInfo, hour);
  io.speedTimer.unref();

  updateInfo(
    nwDoc.title = nwWin.title =
      `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`
  );
  getDoc(toc.shift());
}

/**
 * Click handler of the "Save" button. Switches the UI into the running
 * state, opens the dictionary, progress-log and error-log files in the
 * output directory (writing the dictionary header into a new dictionary
 * file), loads the URL list from the TOC file and starts the download
 * queue. When the progress log is not empty, every URL up to and including
 * its last logged line is skipped so that an interrupted save is resumed.
 */
function saveDic() {
  saving = true;
  stop = false; // a stop requested during a previous run must not abort this one
  eSave.removeEventListener('click', saveDic);
  eSave.textContent = 'Stop';
  eSave.addEventListener('click', onStop);
  eTocFile.disabled = true;
  eOutputDir.disabled = true;

  try {
    fs.accessSync(io.tocFile);
  } catch (e) {
    eInfo.textContent = `TOC file not available: '${e}.'`;
    endSaving(); // restores the idle UI state and plays the notification
    return;
  }

  try {
    io.dicFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.dsl'), 'a');
    io.logFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.log'), 'a+');
    io.errFile = fs.openSync(pth.join(io.outputDir, 'WordSpy.dic.errors.log'), 'a');
  } catch (e) {
    eInfo.textContent = `Output files not available: '${e}.'`;
    endSaving();
    return;
  }

  if (fs.fstatSync(io.dicFile).size === 0) {
    // New dictionary file: byte order mark followed by the header lines.
    const header = [
      '#NAME "Word Spy 2016 (Eng-Eng)"',
      '#INDEX_LANGUAGE "English"',
      '#CONTENTS_LANGUAGE "English"',
      '',
    ].join('\n');
    fs.writeSync(io.dicFile, '\uFEFF' + header, null, 'utf8');
  }

  // Entries left over from a previous run would otherwise be queued twice.
  toc.length = 0;
  eInfo.textContent = 'Reading the TOC file...\n';

  readNonEmptyLines(
    fs.createReadStream(io.tocFile, 'utf8'),
    line => { toc.push(line); },
    () => {
      if (!toc.length) {
        eInfo.textContent = 'No URLs found.';
        endSaving();
        return;
      }

      eBrowser.addEventListener('load', checkDoc);
      eBrowser.addEventListener('error', logError);

      if (fs.fstatSync(io.logFile).size === 0) {
        startQueue();
        return;
      }

      updateInfo('Reading the log file...');
      let lastLine;
      readNonEmptyLines(
        fs.createReadStream(null, {encoding: 'utf8', fd: io.logFile, autoClose: false}),
        line => { lastLine = line; },
        () => {
          // Drop everything up to and including the last logged URL; when it
          // is not found (indexOf gives -1) nothing is dropped.
          toc.splice(0, toc.indexOf(lastLine) + 1);
          startQueue();
        }
      );
    }
  );
}
| /******************************************************************************/ | |
/**
 * Requests the next page. When a stop has been requested the save is ended
 * instead; otherwise the URL bookkeeping is shifted (current becomes
 * previous), the URL is shown in the status area and assigned to the
 * browsing frame.
 *
 * @param {string} url - address to load
 */
function getDoc(url) {
  if (!stop) {
    prevURL = currURL;
    currURL = url;
    updateInfo(` ${url}`);
    eBrowser.src = url;
    return;
  }

  const message = 'Stop on demand.';
  nwWin.title = message;
  nwDoc.title = message;
  updateInfo(message);
  endSaving();
}
| /******************************************************************************/ | |
/**
 * "load" handler of the browsing frame. Polls the loaded document every
 * `checkFreq` ms until one of `selectorsToCheck` matches, then hands the
 * page to processDoc(). After more than 50 unsuccessful polls it gives up:
 * if the frame is still at the requested URL the same URL is requested
 * again, otherwise the problem is logged and the save is ended.
 */
function checkDoc() {
  const frameWin = eBrowser.contentWindow;
  const frameDoc = frameWin.document;
  const frameURL = frameWin.location.href;
  let ticks = 0;

  const timer = global.setInterval(function poll() {
    ticks += 1;

    const marker = frameDoc.querySelector(selectorsToCheck.join(', '));
    if (marker) {
      global.clearInterval(timer);
      processDoc(frameWin, frameDoc, frameURL, ticks);
      return;
    }

    if (ticks <= 50) {
      return; // keep waiting
    }

    global.clearInterval(timer);

    if (frameURL !== currURL) {
      // The frame ended up at a different address than the one requested.
      const entry = `Something wrong (${new Date()}).\n${currURL}\n`;
      fs.writeSync(io.errFile, entry, null, 'utf8');
      const failure = 'Something wrong...';
      nwWin.title = failure;
      nwDoc.title = failure;
      updateInfo(failure);
      endSaving();
      return;
    }

    eAudio.play();
    const retrying = 'HTTP error. Retrying...';
    nwWin.title = retrying;
    nwDoc.title = retrying;
    updateInfo(retrying);
    getDoc(currURL);
  }, checkFreq);
}
| /******************************************************************************/ | |
/** Returns a static array of the descendants of `root` matching `selector`. */
function selectAll(root, selector) {
  return Array.from(root.querySelectorAll(selector));
}

/**
 * Puts `before` at the start and `after` at the end of the element's content
 * as plain text. Text insertion (not HTML insertion) is used on purpose:
 * markers such as "\<smirk\>" or URLs containing "&" must not be interpreted
 * by the HTML parser.
 */
function wrapWithText(elm, before, after) {
  elm.insertAdjacentText('afterbegin', before);
  elm.insertAdjacentText('beforeend', after);
}

/**
 * Wraps each element in [tag]…[/tag] unless one of its ancestors already
 * carries the marker attribute `attr`, and marks the wrapped element with
 * that attribute so that nested elements are not wrapped a second time.
 */
function markOnce(iDoc, elms, tag, attr) {
  elms.forEach(elm => {
    const markedAncestor = iDoc.evaluate(
      `./ancestor::*[@${attr}]`, elm, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null
    ).singleNodeValue;
    if (markedAncestor) return;
    wrapWithText(elm, `[${tag}]`, `[/${tag}]`);
    elm.setAttribute(attr, '');
  });
}

/**
 * Fills `headwordsBuffer` for the current page. Word pages (URL starting
 * with the "?word=" address) contribute the page header plus every synonym
 * and inflected form listed on the page; any other page contributes a single
 * headword made of "\# " and the page header.
 */
function collectHeadwords(iDoc, elmToSave, iLoc) {
  const title = secureHigh(secureLow(iDoc.querySelector('#content-header').innerText), true);

  if (!iLoc.startsWith('http://www.wordspy.com/index.php?word=')) {
    headwordsBuffer.add(`\\# ${title}`);
    return;
  }

  headwordsBuffer.add(title);

  // Text nodes inside elmToSave were escaped by the caller already, so only
  // whitespace normalisation is applied to synonyms and inflections.
  const synonyms = elmToSave.querySelector('#synonyms .word-meta-data');
  if (synonyms) {
    synonyms.innerText.split('·')
      .forEach(synonym => { headwordsBuffer.add(secureHigh(synonym, true)); });
  }

  const inflections = elmToSave.querySelector('#inflections .word-meta-data');
  if (inflections) {
    // The italicized parts are hidden while the text is read, presumably so
    // that innerText leaves them out of the headwords; then shown again.
    const abbreviations = selectAll(elmToSave, '#inflections .italicized');
    abbreviations.forEach(abbrv => { abbrv.style.visibility = 'hidden'; });
    inflections.innerText.split('·')
      .forEach(form => {
        headwordsBuffer.add(secureHigh(form.trim().replace(/\s+\.$/, ''), true));
      });
    abbreviations.forEach(abbrv => { abbrv.style.visibility = 'visible'; });
  }
}

/**
 * Converts the loaded page into one dictionary card and continues with the
 * next URL.
 *
 * Steps, in order: remove unwanted elements; backslash-escape all text
 * nodes of the saved element; collect headwords; insert line breaks and DSL
 * markup around the HTML structure (the order of these steps matters, later
 * steps see the text added by earlier ones); append the card to the
 * dictionary file and the page URL to the progress log; then request the
 * next URL or end the save when the queue is empty.
 *
 * @param {Window} iWin - window of the browsing frame
 * @param {Document} iDoc - document of the browsing frame
 * @param {string} iLoc - address of the loaded page
 * @param {number} iter - number of readiness polls that were needed
 */
function processDoc(iWin, iDoc, iLoc, iter) {
  selectorsToDelete.forEach(s => {
    const e = iDoc.querySelector(s);
    if (e) e.parentNode.removeChild(e);
  });

  const elmToSave = iDoc.querySelector(selectorsToSave.join(', '));

  // Escape the page text before any markup is added, so that the inserted
  // markers themselves stay unescaped.
  const textNodes = iDoc.evaluate(
    './/text()', elmToSave, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null
  );
  for (let i = 0; i < textNodes.snapshotLength; i++) {
    const t = textNodes.snapshotItem(i);
    t.textContent = secureLow(t.textContent);
  }

  collectHeadwords(iDoc, elmToSave, iLoc);

  // Block-level elements become paragraphs separated by blank lines.
  selectAll(elmToSave, 'blockquote, div, h3, ol, p, table, ul').forEach(elm => {
    elm.insertAdjacentHTML('beforebegin', '<br><br>');
    elm.insertAdjacentHTML('afterend', '<br><br>');
  });
  selectAll(elmToSave, 'hr').forEach(elm => {
    elm.insertAdjacentHTML('afterend', '<br><br>----------<br><br>');
  });
  selectAll(elmToSave, 'iframe').forEach(elm => {
    elm.insertAdjacentHTML('afterend',
      '<br><br>\\[Embeded video or page. See on the site.\\]<br><br>');
  });
  selectAll(elmToSave, 'blockquote').forEach(elm => {
    // An empty blockquote has no lastChild.
    if (elm.lastChild && elm.lastChild.tagName === 'BR') elm.removeChild(elm.lastChild);
    wrapWithText(elm, '“', '”');
  });

  // List items: bullets for unordered lists, running numbers for ordered ones.
  selectAll(elmToSave, 'ul').forEach(elm => {
    selectAll(elm, 'li').forEach(li => {
      li.insertAdjacentText('afterbegin', '• ');
    });
  });
  selectAll(elmToSave, 'ol').forEach(elm => {
    selectAll(elm, 'li').forEach((li, i) => {
      li.insertAdjacentText('afterbegin', `${i + 1}. `);
    });
  });

  // Pseudo-tags used on the site are kept visible as escaped literal text.
  selectAll(elmToSave, 'smirk, flame').forEach(elm => {
    const tag = elm.tagName.toLowerCase();
    wrapWithText(elm, `\\<${tag}\\>`, `\\</${tag}\\>`);
  });
  selectAll(elmToSave, 'span.add-separator').forEach(elm => {
    elm.insertAdjacentText('afterbegin', ' · ');
  });

  // Headings and citation years.
  selectAll(elmToSave, 'h3').forEach(elm => {
    wrapWithText(elm, '[b][c steelblue]', '[/c][/b]');
  });
  selectAll(elmToSave, 'div.word-citation-year').forEach(elm => {
    wrapWithText(elm, '[b][c gray]', '[/c][/b]');
  });

  // Italic and bold, first by tag/class and then by computed style.
  markOnce(iDoc, selectAll(elmToSave, 'i, em, span.italicized'), 'i', 'data-dsl-i');
  markOnce(
    iDoc,
    selectAll(elmToSave, 'span')
      .filter(elm => iWin.getComputedStyle(elm).fontStyle !== 'normal'),
    'i', 'data-dsl-i'
  );
  markOnce(iDoc, selectAll(elmToSave, 'b, span.headword'), 'b', 'data-dsl-b');
  markOnce(
    iDoc,
    selectAll(elmToSave, 'span')
      .filter(elm => iWin.getComputedStyle(elm).fontWeight === 'bold'),
    'b', 'data-dsl-b'
  );

  selectAll(elmToSave, 'sup').forEach(elm => {
    wrapWithText(elm, '[sup]', '[/sup]');
  });

  // Links: bare URLs, references to other cards, or underlined text followed
  // by the target address.
  selectAll(elmToSave, 'a[href]').forEach(elm => {
    if (elm.href.replace(/http:\/\/(?:www\.)?/, '') ===
        elm.innerText.trim().replace(/http:\/\/(?:www\.)?/, '')) {
      wrapWithText(elm, '[url]', '[/url]');
    } else if (/http:\/\/www\.wordspy\.com\/index\.php\?word=/.test(elm.href)) {
      wrapWithText(elm, '[ref]', '[/ref]');
    } else if (/http:\/\/www\.wordspy\.com\/index\.php\?tag=/.test(elm.href)) {
      wrapWithText(elm, '[ref]\\# ', '[/ref]');
    } else {
      if (elm.innerText.trim()) {
        wrapWithText(elm, '[u]', '[/u]');
      }
      elm.insertAdjacentText('beforeend', ` ([url]${secureLow(elm.href)}[/url])`);
    }
  });

  // Images are replaced by their address on a paragraph of its own. Each
  // 'afterend' insertion lands directly after the image, so the pieces are
  // inserted in reverse order.
  selectAll(elmToSave, 'img').forEach(elm => {
    elm.insertAdjacentHTML('afterend', '<br><br>');
    elm.insertAdjacentText('afterend', `[url]${secureLow(elm.src)}[/url]`);
    elm.insertAdjacentHTML('afterend', '<br><br>');
  });

  selectAll(elmToSave, 'div.word-citation').forEach(elm => {
    wrapWithText(elm, '[m2]', '[/m]');
  });

  // Card = headword lines followed by the body. secureHigh() prefixes every
  // body line after the first with a tab; the first body line gets the same
  // indent here so that it is not written at column 0 like a headword.
  // NOTE(review): confirm against the dictionary compiler that body lines
  // must be indented.
  const card = [
    Array.from(headwordsBuffer).join('\n'),
    `\t${ secureHigh(elmToSave.innerText) }`,
    '',
  ].join('\n');
  fs.writeSync(io.dicFile, card, null, 'utf8');
  fs.writeSync(io.logFile, `${iLoc}\n`, null, 'utf8');
  updateInfo( ` ${(iter * checkFreq / 1000).toFixed(1)} s. ${speedInfo}` );
  headwordsBuffer.clear();

  if (toc.length) {
    updateInfo(
      nwDoc.title = nwWin.title =
        `Rest: ${ toc.length.toString().replace(formatNumberRE, ' ') }.`);
    getDoc(toc.shift());
  } else {
    updateInfo( nwDoc.title = nwWin.title = 'Saving complete.' );
    endSaving();
  }
}
| /******************************************************************************/ | |
/**
 * Ends a save run: closes the dictionary, progress-log and error-log files,
 * detaches the frame listeners, returns the button and the path inputs to
 * the idle state, blanks the browsing frame and plays the notification
 * sound. When an exit was requested during the run the window is closed.
 */
function endSaving() {
  // Close whichever files are open, then forget all three descriptors.
  for (const key of ['dicFile', 'logFile', 'errFile']) {
    if (io[key]) {
      fs.closeSync(io[key]);
    }
  }
  io.errFile = null;
  io.logFile = null;
  io.dicFile = null;

  eBrowser.removeEventListener('load', checkDoc);
  eBrowser.removeEventListener('error', logError);

  // Turn the "Stop" button back into "Save".
  eSave.removeEventListener('click', onStop);
  eSave.textContent = 'Save';
  eSave.addEventListener('click', saveDic);

  eTocFile.disabled = false;
  eOutputDir.disabled = false;
  saving = false;
  eBrowser.src = 'about:blank';
  eAudio.play();

  if (!exit) {
    return;
  }
  nwWin.close(true);
}
| /******************************************************************************/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment