Skip to content

Instantly share code, notes, and snippets.

@vsemozhetbyt
Last active October 16, 2016 23:52
Show Gist options
  • Select an option

  • Save vsemozhetbyt/08840cea83929643cc55c9ed6f6379fa to your computer and use it in GitHub Desktop.

Select an option

Save vsemozhetbyt/08840cea83929643cc55c9ed6f6379fa to your computer and use it in GitHub Desktop.
/******************************************************************************/
'use strict';
/******************************************************************************/
const fs = require('fs');
const pth = require('path');
const rl = require('readline');
const urlMod = require('url');
const parseURL = urlMod.parse;
const formatURL = urlMod.format;
/******************************************************************************/
// Run-time settings: defaults first, then overridden by command-line arguments.
const opts = {
  tpcURL: 'http://www.multitran.ru/c/m.exe?a=112&l1=1&l2=2',
  outDir: __dirname,
  stopAfterTopics: false,
  stopAfterTOC: false,
  filterTopics: false,
};
// Recognized flags toggle a setting; an http(s) URL replaces the topics URL;
// any other argument is taken as the output directory.
for (const arg of process.argv.slice(2)) {
  switch (arg) {
    case '-topics':
      opts.stopAfterTopics = true;
      break;
    case '-toc':
      opts.stopAfterTOC = true;
      break;
    case '-filter':
      opts.filterTopics = true;
      break;
    default:
      if (/^https?:\x2f\x2f/i.test(arg)) opts.tpcURL = arg;
      else opts.outDir = arg;
  }
}
// Template text is kept flush-left so the literal stays unchanged.
console.log(xs`
Stop after topics saved: ${opts.stopAfterTopics}.
Stop after TOC saved: ${opts.stopAfterTOC}.
Filter dictionary by topics: ${opts.filterTopics}.
`);
/******************************************************************************/
// The output directory must exist and be a directory; anything else is fatal.
try {
  const outDirIsDirectory = fs.statSync(opts.outDir).isDirectory();
  if (!outDirIsDirectory) {
    console.error(`${opts.outDir} is not a directory. Please enter another one.`);
    process.exit(1);
  } else {
    console.log(`Output directory: ${opts.outDir}\n`);
  }
} catch (err) {
  // statSync failed (e.g. the path does not exist): report and stop.
  console.error(err.toString());
  process.exit(1);
}
/******************************************************************************/
// Language ids come from the `l1` (from) and `l2` (into) query parameters
// of the topics URL; both are required.
const {
  l1: langFromId,
  l2: langIntoId,
} = parseURL(opts.tpcURL, true).query;
if (!(langFromId && langIntoId)) {
  console.error(`Wrong URL: ${opts.tpcURL}`);
  process.exit(1);
}
// Maps the numeric language ids used in the `l1`/`l2` URL query parameters
// to language names. The names are printed to the console and are used to
// build the output file name prefix, so changing a name changes file names.
const langMap = {
1: 'English',
2: 'Russian',
3: 'German',
4: 'French',
5: 'Spanish',
6: 'Hebrew',
7: 'Serbian',
8: 'Croatian',
9: 'Tatar',
10: 'Arabic',
11: 'Portuguese',
12: 'Lithuanian',
13: 'Romanian',
14: 'Polish',
15: 'Bulgarian',
16: 'Czech',
17: 'Chinese',
18: 'Hindi',
19: 'Bengali',
20: 'Punjabi',
21: 'Vietnamese',
22: 'Danish',
23: 'Italian',
24: 'Dutch',
25: 'Azerbaijani',
26: 'Estonian',
27: 'Latvian',
28: 'Japanese',
29: 'Swedish',
30: 'Norwegian Bokmål',
31: 'Afrikaans',
32: 'Turkish',
33: 'Ukrainian',
34: 'Esperanto',
35: 'Kalmyk',
36: 'Finnish',
37: 'Latin',
38: 'Greek',
39: 'Korean',
40: 'Georgian',
41: 'Armenian',
42: 'Hungarian',
43: 'Kazakh',
44: 'Kirghiz',
45: 'Uzbek',
46: 'Romany',
47: 'Albanian',
48: 'Welsh',
49: 'Irish',
50: 'Icelandic',
51: 'Kurdish',
52: 'Persian',
53: 'Catalan',
54: 'Corsican',
55: 'Galician',
56: 'Mirandese',
57: 'Romansh',
58: 'Belarusian',
59: 'Ruthene',
60: 'Slovak',
61: 'Upper Sorbian',
62: 'Lower Sorbian',
63: 'Bosnian',
64: 'Montenegrin',
65: 'Macedonian',
66: 'Old Church Slavonic',
67: 'Slovenian',
68: 'Basque',
69: 'Svan',
70: 'Mingrelian',
71: 'Abkhazian',
72: 'Adyghe',
73: 'Chechen',
74: 'Avar',
75: 'Ingush',
76: 'Crimean Tatar',
77: 'Chuvash',
78: 'Maltese',
79: 'Khmer',
80: 'Nepali',
81: 'Amharic',
82: 'Assamese',
83: 'Lao',
84: 'Asturian',
85: 'Odia',
86: 'Indonesian',
87: 'Pashto',
88: 'Quechua',
89: 'Maori',
90: 'Marathi',
91: 'Tamil',
92: 'Telugu',
93: 'Thai',
94: 'Turkmen',
95: 'Yoruba',
96: 'Bosnian cyrillic',
97: 'Chinese simplified',
98: 'Chinese Taiwan',
99: 'Filipino',
100: 'Gujarati',
101: 'Hausa',
102: 'Igbo',
103: 'Inuktitut',
104: 'Isixhosa',
105: 'Zulu',
106: 'Kannada',
107: 'Kinyarwanda',
108: 'Swahili',
109: 'Konkani',
110: 'Luxembourgish',
111: 'Malayalam',
112: 'Wolof',
113: 'Guajiro',
114: 'Serbian latin',
115: 'Tswana',
116: 'Sinhala',
117: 'Urdu',
118: 'Sesotho sa leboa',
119: 'Norwegian Nynorsk',
120: 'Malay',
121: 'Mongolian',
122: 'Frisian',
123: 'Faroese',
124: 'Friulian',
125: 'Ladin',
126: 'Sardinian',
127: 'Occitan',
128: 'Gaulish',
129: 'Gallegan',
130: 'Sami',
131: 'Breton',
132: 'Cornish',
133: 'Manh',
134: 'Scottish Gaelic',
135: 'Yiddish',
136: 'Tajik',
};
// Resolve the language names; ids missing from langMap are reported
// as 'Unidentified'.
const langFrom = langMap[langFromId] === undefined ?
  'Unidentified' :
  langMap[langFromId];
const langInto = langMap[langIntoId] === undefined ?
  'Unidentified' :
  langMap[langIntoId];
// Template text is kept flush-left so the literal stays unchanged.
console.log(xs`
Language to translate from: ${langFrom}.
Language to translate into: ${langInto}.
`);
// Common prefix of all output file names (whitespace replaced by '_').
const fnPrefix = `MT.${langFrom}-${langInto}`.replace(/\s+/g, '_');
// Suffix that the `t` query parameter of headword links ends with.
const urlLangSuffix = `_${langFromId}_${langIntoId}`;
const urlLangSuffixRE = new RegExp(`${urlLangSuffix}$`);
/******************************************************************************/
// Encoding of every file this script writes and reads back.
const outCoding = 'utf8';
// Matches a byte order mark at the very start of a string.
const bomRE = /^\uFEFF/;
// A file of this size or less is treated as holding no data
// (see fileExistsAndHasData).
const BOM_SIZE = 3;
// User-Agent header sent with every request.
const userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 ' +
                  '(KHTML, like Gecko) Chrome/52.0.2741.0 Safari/537.36';
// Delay before each request, in milliseconds.
const timeout = 3000;
// Queue of [name, url] pairs for topics not processed yet.
const topics = [];
// Topic whose pages are being processed now.
const currTopic = { name: '', url: '' };
// Recently saved headword links, used to skip repeats on following pages.
const hwLinksBufPrev = { fullHrefs: [], keyParams: [] };
const HW_LINKS_BUF_LIMIT = 60;
// Previous and current request URLs (equal while a request is being retried).
let prevURL = '';
let currURL = '';
// Set on Ctrl+C; getDoc checks it before scheduling the next request.
let terminate = false;
process.on('SIGINT', () => {
  terminate = true;
});
/******************************************************************************/
// Encoding the downloaded pages are decoded from (see getDoc).
const charset = 'windows-1251';
// Load the core module named by the protocol of the topics URL
// (the trailing ':' is removed from the protocol to get the module name).
const http = require(parseURL(opts.tpcURL).protocol.replace(/:$/, ''));
const iconv = require('iconv-lite');
// The message is written before the require call and completed after it.
process.stdout.write('Loading jsdom... ');
const jsdom = require('jsdom');
// NOTE(review): presumably stops jsdom from fetching and processing external
// resources of the parsed pages — confirm against the jsdom version in use.
jsdom.defaultDocumentFeatures = {
FetchExternalResources: false,
ProcessExternalResources: false,
};
// Virtual console passed to every jsdom document created in getDoc;
// its output is sent to the real console.
const virtualConsole = jsdom.createVirtualConsole().sendTo(console);
console.log('Loaded.\n');
/******************************************************************************/
// Lazy file descriptor dictionary: the first read of a property opens the
// matching output file (name and open flag come from prop2params), writes a
// BOM if the file is empty, and caches the descriptor on the target.
const fdd = new Proxy(
  {
    prop2params: {
      errFile: [`${fnPrefix}.err.log`, 'a'],
      reqFile: [`${fnPrefix}.req.log`, 'a+'],
      tocRawFile: [`${fnPrefix}.toc_raw.txt`, 'a+'],
      tocSortedFile: [`${fnPrefix}.toc_sorted.txt`, 'a+'],
      tocFilteredFile: [`${fnPrefix}.toc_filtered.txt`, 'a+'],
      dicFile: [`${fnPrefix}.dic.dsl`, 'a'],
    },
  },
  {
    get(target, property) {
      // Already opened (or an own property such as prop2params): reuse it.
      if (target[property]) return target[property];

      const [name, flag] = target.prop2params[property];
      const fd = fs.openSync(pth.join(opts.outDir, name), flag);
      target[property] = fd;
      // A brand new (empty) file starts with a BOM.
      if (fs.fstatSync(fd).size === 0) {
        fs.writeSync(fd, '\uFEFF', null, outCoding);
      }
      return fd;
    },
  }
);
/******************************************************************************/
// Output files already present in the output directory for this language pair.
const outputFiles = fs.readdirSync(opts.outDir)
  .filter(name => name.startsWith(`${fnPrefix}.`));
// Resume from the latest stage that has left data behind; the stages are
// checked from the last one to the first one. With no data, start from scratch.
{
  const stages = [
    ['dic.dsl', restoreDicStat],
    ['toc_filtered.txt', initDic],
    ['toc_sorted.txt', filterTOC],
    ['toc_raw.txt', restoreTOCStat],
    ['tpc.txt', restoreTopicsStat],
  ];
  const found = stages.find(
    ([suffix]) => fileExistsAndHasData(`${fnPrefix}.${suffix}`)
  );
  const resume = found ? found[1] : initTopics;
  resume();
}
/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
/**
 * Start from scratch: request the topics page and hand the parsed document
 * to processTopicsDoc.
 */
function initTopics() {
  const { tpcURL } = opts;
  console.log('Loading topics list...\n');
  getDoc(tpcURL, processTopicsDoc);
}
/******************************************************************************/
/**
 * Refill the topics queue from the saved topics file.
 *
 * Each valid record has the form "<name>: <http(s) URL>"; other lines are
 * skipped. Exits the process if no valid record is found.
 *
 * @param {boolean} [returnToRestoreTOCStat] - truthy when called from
 *   restoreTOCStat, which continues on its own instead of starting the TOC.
 */
function restoreTopicsStat(returnToRestoreTOCStat) {
  console.log('Reading topics list...\n');
  const tpcPath = pth.join(opts.outDir, `${fnPrefix}.tpc.txt`);
  const records = fs.readFileSync(tpcPath, outCoding).split(/(?:\r?\n)+/);
  for (const record of records) {
    const pair = record.trim().split(/: +(?=https?:\x2f\x2f)/, 2);
    if (pair.length === 2) topics.push(pair);
  }
  if (!topics.length) {
    console.error('Something wrong with the topics file: no valid records found.');
    process.exit(1);
  } else {
    console.log(`Topics to load: ${topics.length}.\n`);
    if (!(opts.stopAfterTopics || returnToRestoreTOCStat)) initTOC();
  }
}
/******************************************************************************/
/**
 * Collect topic links from the topics page, save them to the topics file and,
 * unless the run must stop after topics, start loading the TOC.
 *
 * A link is a topic link when its query has a === '110', the current language
 * pair in l1/l2 and an `sc` parameter. Exits the process if none is found.
 *
 * @param {Document} doc - parsed topics page.
 */
function processTopicsDoc(doc) {
  const links = Array.from(doc.querySelectorAll('a[href]'));
  for (const lnk of links) {
    const { a, l1, l2, sc } = parseURL(lnk.href, true).query;
    const isTopicLink = a === '110' &&
                        l1 === langFromId &&
                        l2 === langIntoId &&
                        sc !== undefined;
    if (isTopicLink) {
      // Topic name with whitespace runs collapsed, plus the resolved URL.
      topics.push([lnk.textContent.trim().replace(/\s+/g, ' '), lnk.href]);
    }
  }
  if (!topics.length) {
    console.error('Something wrong with the topics page: no proper links found.');
    process.exit(1);
  } else {
    const listing = topics.map(pair => pair.join(': ')).join('\n');
    fs.writeFileSync(
      pth.join(opts.outDir, `${fnPrefix}.tpc.txt`),
      `\uFEFF${listing}\n`,
      outCoding
    );
    console.log(`Topics to load: ${topics.length}.\n`);
    if (!opts.stopAfterTopics) initTOC();
  }
}
/******************************************************************************/
/**
 * Take the next topic off the queue and make it the current one.
 *
 * @returns {string} URL of the topic that has just become current.
 */
function nextTopic() {
  const [name, url] = topics.shift();
  currTopic.name = name;
  currTopic.url = url;
  return url;
}
/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
/**
 * Start loading the TOC (headwords list) with the first page of the
 * first queued topic.
 */
function initTOC() {
  console.log('Loading TOC (headwords list)...\n');
  const firstTopicURL = nextTopic();
  getDoc(firstTopicURL, processTOCDoc);
}
/******************************************************************************/
/**
 * Resume an interrupted TOC download.
 *
 * Reads the requests log, takes its last non-empty line as the last processed
 * page URL, drops the topics that precede that page's topic from the queue and
 * re-requests the page with the `saved` flag set, so its links are not saved
 * a second time.
 *
 * Exits the process with an error message when the log holds no URL, when the
 * last URL has no `sc` (topic id) parameter, or when that topic is absent from
 * the topics list.
 */
function restoreTOCStat() {
  if (!topics.length) restoreTopicsStat(true);
  console.log('Reading TOC log file...\n');
  let lineNumber = 0;
  let lastLine;
  rl.createInterface({
    input: fs.createReadStream(null, {
      // autoClose is off: the descriptor is reused later for appending.
      fd: fdd.reqFile, encoding: outCoding, autoClose: false,
    }),
  }).on('line', line => {
    if (++lineNumber === 1) line = line.replace(bomRE, '');
    line = line.trim();
    if (line) lastLine = line;
  }).on('close', () => {
    // An empty log leaves lastLine undefined; parseURL would throw on it.
    const topicId = lastLine ? parseURL(lastLine, true).query.sc : undefined;
    if (!topicId) {
      // Template text is kept flush-left so the literal stays unchanged.
      console.error(xs`
Something wrong with the requests log file:
no topic id found in the last URL.
`);
      process.exit(1);
    }
    const topicIndex = topics.findIndex(
      t => parseURL(t[1], true).query.sc === topicId
    );
    if (topicIndex === -1) {
      // splice(0, -1) would remove nothing and silently restart from the
      // first topic while continuing from a page of another one.
      console.error(
        'Something wrong with the requests log file: ' +
        `topic ${topicId} is not in the topics list.`
      );
      process.exit(1);
    }
    topics.splice(0, topicIndex);
    nextTopic();
    getDoc(lastLine, processTOCDoc, true);
  });
}
/******************************************************************************/
/**
 * Process one TOC (headwords list) page: save its new headword links,
 * log the processed URL and request the next page or the next topic.
 *
 * @param {Document} doc - parsed TOC page.
 * @param {boolean} [saved] - truthy when the page was already saved in a
 *   previous run (see restoreTOCStat); nothing is written for such a page,
 *   it is used only to find where to go next.
 */
function processTOCDoc(doc, saved) {
// hrefs of matching links that have no text; reported via logError below
const empties = [];
// links accepted from this page (and the s1 parameters seen on it)
const hwLinksBufCurr = { fullHrefs: [], keyParams: [] };
// headword links: links in first table cells whose `t` query parameter
// ends with the language pair suffix and which have text content
const hwLinks = [...doc.querySelectorAll('td:first-child a[href]')]
.filter(lnk => {
const q = parseURL(lnk.href, true).query;
if (q.t && q.t.endsWith(urlLangSuffix)) {
if (lnk.textContent) {
return true;
}
empties.push(lnk.href);
return false;
}
return false;
})
.map(lnk => lnk.href);
if (hwLinks.length || empties.length || saved) {
if (!saved) {
if (hwLinks.length) {
hwLinks.forEach(lnk => {
const s1 = parseURL(lnk, true).query.s1;
if (s1) {
// a link with an s1 parameter is accepted once per s1 value
if (!hwLinksBufCurr.keyParams.includes(s1) &&
!hwLinksBufPrev.keyParams.includes(s1)) {
hwLinksBufCurr.fullHrefs.push(lnk);
hwLinksBufCurr.keyParams.push(s1);
// the same URL without s1 is dropped from both buffers if present
const destituteUrl = stripParams(lnk, 's1');
const dupIxCurr = hwLinksBufCurr.fullHrefs.lastIndexOf(destituteUrl);
if (dupIxCurr > -1) hwLinksBufCurr.fullHrefs.splice(dupIxCurr, 1);
const dupIxPrev = hwLinksBufPrev.fullHrefs.lastIndexOf(destituteUrl);
if (dupIxPrev > -1) hwLinksBufPrev.fullHrefs.splice(dupIxPrev, 1);
}
} else {
// a link without s1 is accepted unless the same href is in a buffer
if (!hwLinksBufCurr.fullHrefs.includes(lnk) &&
!hwLinksBufPrev.fullHrefs.includes(lnk)) {
hwLinksBufCurr.fullHrefs.push(lnk);
}
}
});
if (hwLinksBufCurr.fullHrefs.length) {
fs.writeSync(fdd.tocRawFile,
`${hwLinksBufCurr.fullHrefs.join('\n')}\n`,
null, outCoding);
// progress line: text of the cell holding the last headword link of the page
const lastHWLink = doc.querySelector(
`td:first-child a[href*='${parseURL(hwLinks[hwLinks.length - 1], true).query.t}']`
);
progressInfo(
`...${lastHWLink.parentNode.textContent.trim()} (${currTopic.name})`
);
// append this page's links to the rolling buffers and keep only
// the newest HW_LINKS_BUF_LIMIT entries in each of them
hwLinksBufPrev.fullHrefs.push(...hwLinksBufCurr.fullHrefs);
let excess = hwLinksBufPrev.fullHrefs.length - HW_LINKS_BUF_LIMIT;
if (excess > 0) {
hwLinksBufPrev.fullHrefs.splice(0, excess);
}
hwLinksBufPrev.keyParams.push(...hwLinksBufCurr.keyParams);
excess = hwLinksBufPrev.keyParams.length - HW_LINKS_BUF_LIMIT;
if (excess > 0) {
hwLinksBufPrev.keyParams.splice(0, excess);
}
}
}
if (empties.length) {
logError(
'Empty links detected',
` ${empties.join('\n ')}`,
`${currTopic.name}: empty links detected. See error log.`
);
}
// record the page as processed; restoreTOCStat resumes from the last record
fs.writeSync(fdd.reqFile, `${currURL}\n`, null, outCoding);
}
// next page: the last link with "recno=" in its href and '>>' as its text
const nextPageLink = [...doc.querySelectorAll('a[href*="recno="]')]
.filter(lnk => lnk.textContent.trim() === '>>')
.pop();
if (nextPageLink) {
getDoc(nextPageLink.href, processTOCDoc);
} else {
if (topics.length) {
getDoc(nextTopic(), processTOCDoc);
} else {
// no more pages and no more topics: the raw TOC is complete
console.log('\nTOC saved.\n');
sortTOC();
}
}
} else {
// neither headword links nor empty links were found: unexpected page
logError(
'Parsing error',
'No expected data found on the page.',
xs`
Parsing error: no expected data found on the page.
(${currURL})
`
);
process.exit(1);
}
}
/******************************************************************************/
/**
 * Read the raw TOC file, sort its URLs, write them to the sorted TOC file
 * and go on to filterTOC.
 *
 * Order: URLs without an `s1` parameter first, ordered by the numeric part
 * of `t` (language suffix removed); then URLs with `s1`, ordered by `s1`.
 */
function sortTOC() {
  console.log('Sorting TOC...');
  const sorter = [];
  let lineNumber = 0;

  const compareURLs = (a, b) => {
    const { s1: as1, t: at } = parseURL(a, true).query;
    const { s1: bs1, t: bt } = parseURL(b, true).query;
    if (as1 && bs1) {
      return as1 < bs1 ? -1 : (as1 > bs1 ? 1 : 0);
    }
    // Exactly one URL has s1: it goes after the one without s1.
    if (as1 || bs1) return as1 ? 1 : -1;
    return Number(at.replace(urlLangSuffixRE, '')) -
           Number(bt.replace(urlLangSuffixRE, ''));
  };

  const reader = rl.createInterface({
    input: fs.createReadStream(null, {
      fd: fdd.tocRawFile, encoding: outCoding, start: 0,
    }),
  });

  reader.on('line', rawLine => {
    lineNumber += 1;
    // Only the first line can carry the BOM.
    const url = (lineNumber === 1 ? rawLine.replace(bomRE, '') : rawLine).trim();
    if (url) sorter.push(url);
  });

  reader.on('close', () => {
    sorter.sort(compareURLs);
    for (const url of sorter) {
      fs.writeSync(fdd.tocSortedFile, `${url}\n`, null, outCoding);
    }
    sorter.length = 0;
    console.log('TOC sorted.\n');
    filterTOC();
  });
}
/******************************************************************************/
/**
 * Read the sorted TOC file, drop duplicate URLs, write the result to the
 * filtered TOC file and, unless the run must stop after the TOC, go on to
 * initDic.
 *
 * A URL with an `s1` parameter is kept once per `s1` value and removes the
 * same URL without `s1` from the result; URLs without `s1` are kept once.
 */
function filterTOC() {
  console.log('Filtering out duplicate URLs in TOC...\n');
  const fullHrefFilter = new Set();
  const keyParamFilter = new Set();
  let lineNumber = 0;
  let urls = 0;

  const reader = rl.createInterface({
    input: fs.createReadStream(null, {
      fd: fdd.tocSortedFile, encoding: outCoding, start: 0,
    }),
  });

  reader.on('line', rawLine => {
    lineNumber += 1;
    // Only the first line can carry the BOM.
    const url = (lineNumber === 1 ? rawLine.replace(bomRE, '') : rawLine).trim();
    if (!url) return;
    urls += 1;
    const { s1 } = parseURL(url, true).query;
    if (!s1) {
      fullHrefFilter.add(url);
      return;
    }
    if (keyParamFilter.has(s1)) return;
    fullHrefFilter.add(url);
    keyParamFilter.add(s1);
    fullHrefFilter.delete(stripParams(url, 's1'));
  });

  reader.on('close', () => {
    for (const url of fullHrefFilter) {
      fs.writeSync(fdd.tocFilteredFile, `${url}\n`, null, outCoding);
    }
    const removed = urls - fullHrefFilter.size;
    if (removed > 0) {
      // Template text is kept flush-left so the literal stays unchanged.
      console.log(xs`
TOC reduced from ${urls} to ${fullHrefFilter.size} URLs.
Filtered out dublicates: ${removed}.
`);
    } else {
      console.log(`No duplicate headwords found.\n`);
    }
    fullHrefFilter.clear();
    keyParamFilter.clear();
    if (!opts.stopAfterTOC) initDic();
  });
}
/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
/**
 * Entry point of the dictionary loading stage.
 * For now it only announces the stage on the console.
 */
function initDic() {
  const announcement = 'Loading dictionary articles...\n';
  console.log(announcement);
}
/******************************************************************************/
/**
 * Called at startup when the dictionary file already holds data.
 * The body is empty: nothing is done in this case.
 * NOTE(review): presumably a placeholder for resuming the dictionary
 * download — confirm.
 */
function restoreDicStat() {
}
/******************************************************************************/
/**
 * Empty stub; it is not called anywhere in the visible code.
 * NOTE(review): presumably a placeholder for processing a dictionary
 * article page — confirm.
 *
 * @param {Document} doc - unused.
 */
function processDicDoc(doc) {
}
/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
/**
 * Download a page after a fixed delay, parse it with jsdom and pass the
 * document to a processing function.
 *
 * - Exits the process if termination was requested (SIGINT).
 * - Request errors, response stream errors and 5xx responses are logged and
 *   the same request is repeated.
 * - Any other non-200 status is logged and the process exits.
 *
 * @param {string} url - page URL.
 * @param {function(Document, *)} processFunction - receives the parsed
 *   document and the `saved` value.
 * @param {*} [saved] - passed through to processFunction unchanged.
 */
function getDoc(url, processFunction, saved) {
  if (terminate) {
    console.log('Exit on demand.');
    process.exit();
  }
  const STATUS_CODE_OK = 200;
  const STATUS_CODE_SERVER_ERROR = 500;
  // The port is passed on too, so a URL with an explicit port is not
  // silently requested on the default one (a null port means the default).
  const { hostname, port, path } = parseURL(url);
  const processHTTPError = err => {
    logError('HTTP error', err.toString(), 'HTTP error. Retrying...');
    getDoc(url, processFunction, saved);
  };
  // On a retry both URLs become equal, which makes logError skip
  // repeated entries for the same URL.
  [prevURL, currURL] = [currURL, url];
  setTimeout(() => {
    http.request({
      hostname, port, path, method: 'GET', headers: { 'User-Agent': userAgent },
    }).on('response', resp => {
      const { statusCode, statusMessage } = resp;
      if (statusCode === STATUS_CODE_OK) {
        let html = '';
        let stream;
        if (charset === 'utf8') {
          resp.setEncoding(charset);
          stream = resp;
        } else {
          stream = resp.pipe(iconv.decodeStream(charset));
        }
        stream.on('data', chunk => {
          html += chunk;
        }).on('end', () => {
          const doc = jsdom.jsdom(html, { url, userAgent, virtualConsole });
          processFunction(doc, saved);
        }).on('error', processHTTPError);
      } else if (statusCode >= STATUS_CODE_SERVER_ERROR) {
        // The body is not needed, but it has to be consumed,
        // otherwise the connection is held until the data is read.
        resp.resume();
        processHTTPError(`Server Error: ${statusCode} ${statusMessage}.`);
      } else {
        // Template text is kept flush-left so the literal stays unchanged.
        logError(
          'HTTP error or suddenness',
          `${statusCode}: ${statusMessage}.`,
          xs`
HTTP error or suddenness: ${statusCode} ${statusMessage}.
(${currURL})
`
        );
        process.exit(1);
      }
    }).on('error', processHTTPError)
      .end();
  }, timeout);
}
/******************************************************************************/
/**
 * Append an error record for the current URL to the error log and
 * optionally print a message to the console.
 *
 * Nothing is written to the log while the previous and the current request
 * URLs are equal (that is, while a request is being retried).
 *
 * @param {string} header - record title.
 * @param {string} details - record body.
 * @param {string} [consoleInfo] - message for stderr, printed if truthy.
 */
function logError(header, details, consoleInfo) {
  const isRepeatedURL = prevURL === currURL;
  if (!isRepeatedURL) {
    const errFd = fdd.errFile;
    const timestamp = (new Date()).toLocaleString();
    // Template text is kept flush-left so the literal stays unchanged.
    const record = xs`
${header} (${timestamp})
${currURL}
${details}\n\n
`;
    fs.writeSync(errFd, record, null, outCoding);
  }
  if (consoleInfo) console.error(consoleInfo);
}
/******************************************************************************/
/**
 * Report progress: print the text and mirror it in the process title.
 *
 * @param {string} str - progress text.
 */
function progressInfo(str) {
  console.log(str);
  process.title = str;
}
/******************************************************************************/
/**
 * Tell whether an output file was found at startup and holds more
 * than BOM_SIZE bytes.
 *
 * @param {string} fName - file name (without directory).
 * @returns {boolean}
 */
function fileExistsAndHasData(fName) {
  if (!outputFiles.includes(fName)) return false;
  const { size } = fs.statSync(pth.join(opts.outDir, fName));
  return size > BOM_SIZE;
}
/******************************************************************************/
/**
 * Return the URL with the given query parameters removed.
 *
 * @param {string} urlStr - source URL.
 * @param {...string} params - names of the query parameters to remove.
 * @returns {string} the re-formatted URL.
 */
function stripParams(urlStr, ...params) {
  const parsed = parseURL(urlStr, true);
  // `search` is removed so that the URL is formatted from the edited `query`.
  delete parsed.search;
  for (const name of params) {
    delete parsed.query[name];
  }
  return formatURL(parsed);
}
/******************************************************************************/
/**
 * Template tag that removes auxiliary code indentation from a template string.
 *
 * The indent of the first indented line sets the amount to remove: up to
 * (that indent - 1) spaces are stripped from the start of every line.
 * NOTE(review): the `- 1` is kept from the original, so lines indented like
 * the first one keep one leading space — confirm that this is intended.
 * One leading and one trailing line feed of the result are removed as well.
 *
 * Spaces are stripped only at real line starts: a literal fragment that
 * directly follows an interpolated value continues the same line, so its
 * leading spaces are kept. A template with no indented line is returned
 * with its spaces untouched instead of throwing.
 *
 * @param {string[]} strings - literal parts of the template.
 * @param {...*} expressions - interpolated values.
 * @returns {string}
 */
function xs(strings, ...expressions) {
  const firstIndentRE = /^ +/m;
  const indentedPart = strings.find(str => firstIndentRE.test(str));
  // No indented line at all: nothing to strip.
  const indentSize = indentedPart === undefined ?
    0 :
    firstIndentRE.exec(indentedPart)[0].length - 1;
  const xLfRE = /^\n|\n$/g;
  // The first fragment starts at the start of a line.
  const lineStartRE = new RegExp(`^ {0,${indentSize}}`, 'gm');
  // The other fragments start in the middle of a line (after a value),
  // so in them only the spaces after a line terminator are indentation.
  const afterBreakRE = new RegExp(
    `([\\n\\r\\u2028\\u2029]) {0,${indentSize}}`, 'g'
  );
  let result = strings[0].replace(lineStartRE, '');
  for (let i = 1; i < strings.length; i++) {
    result += expressions[i - 1] + strings[i].replace(afterBreakRE, '$1');
  }
  return result.replace(xLfRE, '');
}
/******************************************************************************/
/******************************************************************************/
/******************************************************************************/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment