cometkim · July 21, 2023 12:36
diff --git a/convert.mjs b/convert.mjs
 // Convert Wikipedia page articles dump (XML) into a stream of JSON
 //   { id: 0, "title": "...", "text": "..." }
 // The "text" field will also be converted to plain text

 import * as path from 'node:path';
 import * as fs from 'node:fs';
 import XMLParser from 'node-xml-stream';
 import ndjson from 'ndjson';
 import instaview from 'instaview';
 import htmlEntities from 'html-entities';
 import * as htmlToText from 'html-to-text';

 // The path of the XML dump to convert is the first command-line argument.
 const [arg] = process.argv.slice(2);
 if (arg === undefined) {
  // path.resolve() throws a TypeError for a non-string argument; print a
  // usage hint instead of surfacing that stack trace.
  console.error('usage: node convert.mjs <path-to-xml-dump>');
  process.exit(1);
 }
 const filePath = path.resolve(arg);

 const inputStream = fs.createReadStream(filePath, 'utf8');
 const xmlStream = new XMLParser();
 const jsonStream = ndjson.stringify();

 // Parser state, driven by the opentag/closetag handlers. One of:
 // 'idle' | 'page' | 'id' | 'title' | 'revision' | 'text'.
 let state = 'idle';
 // Record being filled in for the current page; written to jsonStream on </page>.
 let doc = { id: -1, title: '', text: '' };

 // State changes allowed when an element opens, keyed by tag name:
 //   tag -> [state required for the change, state entered].
 // Any tag/state combination not listed here leaves `state` untouched.
 const openTagTransitions = new Map([
  ['page', ['idle', 'page']],
  ['id', ['page', 'id']],
  ['title', ['page', 'title']],
  ['revision', ['page', 'revision']],
  ['text', ['revision', 'text']],
 ]);

 // Advance the parser state when a tracked element opens.
 xmlStream.on('opentag', tagName => {
  const transition = openTagTransitions.get(tagName);
  if (transition === undefined) {
    return;
  }
  const [requiredState, nextState] = transition;
  if (state === requiredState) {
    state = nextState;
  }
 });

 // Store character data into the `doc` field selected by the current state.
 // Text seen in any other state is ignored.
 xmlStream.on('text', text => {
  switch (state) {
    case 'title': {
      doc.title = text;
      break;
    }
    case 'id': {
      // Numeric coercion of the element's text content.
      doc.id = Number(text);
      break;
    }
    case 'text': {
      try {
        doc.text = formatText(text);
      } catch (err) {
        // Clear the field so text stored by an earlier page cannot be
        // emitted under this page's id/title when conversion fails.
        doc.text = '';
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`failed to format doc(id=${doc.id}, title=${doc.title}): ${reason}`);
      }
      break;
    }
  }
 });

 // Unwind the parser state when a tracked element closes. Closing </page>
 // while in the 'page' state emits the collected record.
 xmlStream.on('closetag', name => {
  switch (name) {
    case 'page': {
      if (state === 'page') {
        jsonStream.write(doc);
        // Start the next page from a fresh record: a page whose <text> is
        // empty or absent must not inherit fields from the page just
        // written, and the object handed to the stream is never mutated.
        doc = { id: -1, title: '', text: '' };
        state = 'idle';
      }
      break;
    }
    case 'id': {
      if (state === 'id') {
        state = 'page';
      }
      break;
    }
    case 'title': {
      if (state === 'title') {
        state = 'page';
      }
      break;
    }
    case 'revision': {
      if (state === 'revision') {
        state = 'page';
      }
      break;
    }
    case 'text': {
      if (state === 'text') {
        state = 'revision';
      }
      break;
    }
  }
 });

 // Feed the file contents into the XML parser; its events drive the handlers above.
 inputStream.pipe(xmlStream);
 // Send the newline-delimited JSON records to standard output.
 jsonStream.pipe(process.stdout);

 /**
  * Remove `{{...}}` template markup from a string.
  *
  * `{{llang|<word>|<content>}}` is replaced by its content; every other
  * `{{...}}` span is deleted together with at most one whitespace character
  * that follows it.
  *
  * @param {string} markup - Text that may contain `{{...}}` spans.
  * @returns {string} The text with those spans stripped.
  */
 function stripWikiTags(markup) {
  const llangPattern = /{{llang\|\w+\|([^}]*)}}/g;
  const anyTagPattern = /{{[^}]*}}\s?/g;
  const withoutLlang = markup.replace(llangPattern, '$1');
  return withoutLlang.replace(anyTagPattern, '');
 }

 /**
  * Convert one page's raw text into plain text.
  *
  * Pipeline: htmlEntities.decode -> instaview.convert -> stripWikiTags ->
  * htmlToText.convert (word wrapping disabled, with the `a` and `img`
  * selector options listed below).
  *
  * @param {string} text - Raw text content of a page.
  * @returns {string} The result of htmlToText.convert.
  */
 function formatText(text) {
  const decoded = htmlEntities.decode(text);
  const converted = instaview.convert(decoded);
  const stripped = stripWikiTags(converted);
  const renderOptions = {
    wordwrap: false,
    selectors: [
      { selector: 'a', options: { ignoreHref: true } },
      { selector: 'img', format: 'skip' },
    ],
  };
  return htmlToText.convert(stripped, renderOptions);
 }
	// Convert Wikipedia page articles dump (XML) into a stream of JSON
	// { id: 0, "title": "...", "text": "..." }
	// The "text" field will also be converted to plain text

	import * as path from 'node:path';
	import * as fs from 'node:fs';
	import XMLParser from 'node-xml-stream';
	import ndjson from 'ndjson';
	import instaview from 'instaview';
	import htmlEntities from 'html-entities';
	import * as htmlToText from 'html-to-text';

	// The path of the XML dump to convert is the first command-line argument.
	const [arg] = process.argv.slice(2);
	if (arg === undefined) {
	  // path.resolve() throws a TypeError for a non-string argument; print a
	  // usage hint instead of surfacing that stack trace.
	  console.error('usage: node convert.mjs <path-to-xml-dump>');
	  process.exit(1);
	}
	const filePath = path.resolve(arg);

	const inputStream = fs.createReadStream(filePath, 'utf8');
	const xmlStream = new XMLParser();
	const jsonStream = ndjson.stringify();

	// Parser state, driven by the opentag/closetag handlers. One of:
	// 'idle' | 'page' | 'id' | 'title' | 'revision' | 'text'.
	let state = 'idle';
	// Record being filled in for the current page; written to jsonStream on </page>.
	let doc = { id: -1, title: '', text: '' };

	// State changes allowed when an element opens, keyed by tag name:
	//   tag -> [state required for the change, state entered].
	// Any tag/state combination not listed here leaves `state` untouched.
	const openTagTransitions = new Map([
	  ['page', ['idle', 'page']],
	  ['id', ['page', 'id']],
	  ['title', ['page', 'title']],
	  ['revision', ['page', 'revision']],
	  ['text', ['revision', 'text']],
	]);

	// Advance the parser state when a tracked element opens.
	xmlStream.on('opentag', tagName => {
	  const transition = openTagTransitions.get(tagName);
	  if (transition === undefined) {
	    return;
	  }
	  const [requiredState, nextState] = transition;
	  if (state === requiredState) {
	    state = nextState;
	  }
	});

	// Store character data into the `doc` field selected by the current state.
	// Text seen in any other state is ignored.
	xmlStream.on('text', text => {
	  switch (state) {
	    case 'title': {
	      doc.title = text;
	      break;
	    }
	    case 'id': {
	      // Numeric coercion of the element's text content.
	      doc.id = Number(text);
	      break;
	    }
	    case 'text': {
	      try {
	        doc.text = formatText(text);
	      } catch (err) {
	        // Clear the field so text stored by an earlier page cannot be
	        // emitted under this page's id/title when conversion fails.
	        doc.text = '';
	        const reason = err instanceof Error ? err.message : String(err);
	        console.error(`failed to format doc(id=${doc.id}, title=${doc.title}): ${reason}`);
	      }
	      break;
	    }
	  }
	});

	// Unwind the parser state when a tracked element closes. Closing </page>
	// while in the 'page' state emits the collected record.
	xmlStream.on('closetag', name => {
	  switch (name) {
	    case 'page': {
	      if (state === 'page') {
	        jsonStream.write(doc);
	        // Start the next page from a fresh record: a page whose <text> is
	        // empty or absent must not inherit fields from the page just
	        // written, and the object handed to the stream is never mutated.
	        doc = { id: -1, title: '', text: '' };
	        state = 'idle';
	      }
	      break;
	    }
	    case 'id': {
	      if (state === 'id') {
	        state = 'page';
	      }
	      break;
	    }
	    case 'title': {
	      if (state === 'title') {
	        state = 'page';
	      }
	      break;
	    }
	    case 'revision': {
	      if (state === 'revision') {
	        state = 'page';
	      }
	      break;
	    }
	    case 'text': {
	      if (state === 'text') {
	        state = 'revision';
	      }
	      break;
	    }
	  }
	});

	// Feed the file contents into the XML parser; its events drive the handlers above.
	inputStream.pipe(xmlStream);
	// Send the newline-delimited JSON records to standard output.
	jsonStream.pipe(process.stdout);

	/**
	 * Remove `{{...}}` template markup from a string.
	 *
	 * `{{llang|<word>|<content>}}` is replaced by its content; every other
	 * `{{...}}` span is deleted together with at most one whitespace character
	 * that follows it.
	 *
	 * @param {string} markup - Text that may contain `{{...}}` spans.
	 * @returns {string} The text with those spans stripped.
	 */
	function stripWikiTags(markup) {
	  return markup
	    // Unwrap llang tags. The pipes must be escaped as `\|`: written as
	    // `\\|` they become a literal backslash followed by regex alternation,
	    // which no longer matches the tag as a whole.
	    .replace(/{{llang\|\w+\|([^}]*)}}/g, '$1')
	    // Strip all other tags.
	    .replace(/{{[^}]*}}\s?/g, '');
	}

	/**
	 * Convert one page's raw text into plain text.
	 *
	 * Pipeline: htmlEntities.decode -> instaview.convert -> stripWikiTags ->
	 * htmlToText.convert (word wrapping disabled, with the `a` and `img`
	 * selector options listed below).
	 *
	 * @param {string} text - Raw text content of a page.
	 * @returns {string} The result of htmlToText.convert.
	 */
	function formatText(text) {
	  const decoded = htmlEntities.decode(text);
	  const converted = instaview.convert(decoded);
	  const stripped = stripWikiTags(converted);
	  const renderOptions = {
	    wordwrap: false,
	    selectors: [
	      { selector: 'a', options: { ignoreHref: true } },
	      { selector: 'img', format: 'skip' },
	    ],
	  };
	  return htmlToText.convert(stripped, renderOptions);
	}