Last active
July 21, 2023 12:36
-
-
Save cometkim/2d6ade96329edfccff0fd8da57010f31 to your computer and use it in GitHub Desktop.
Convert Wikipedia page-articles data (XML) into a text dataset (NDJSON)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Convert a Wikipedia page-articles dump (XML) into a stream of NDJSON records:
//   { "id": 0, "title": "...", "text": "..." }
// The "text" field is also converted from wiki markup into plain text.
import * as path from 'node:path'; | |
import * as fs from 'node:fs'; | |
import XMLParser from 'node-xml-stream'; | |
import ndjson from 'ndjson'; | |
import instaview from 'instaview'; | |
import htmlEntities from 'html-entities'; | |
import * as htmlToText from 'html-to-text'; | |
// CLI input: the first positional argument is the path to the XML dump.
const [arg] = process.argv.slice(2);
if (arg === undefined) {
  // Without this guard path.resolve(undefined) throws an opaque TypeError.
  console.error('usage: node <script> <path-to-pages-articles.xml>');
  process.exit(1);
}
const filePath = path.resolve(arg);

// Source file -> XML parser (event based) ; records -> NDJSON serializer.
const inputStream = fs.createReadStream(filePath, 'utf8');
const xmlStream = new XMLParser();
const jsonStream = ndjson.stringify();

// Parser state, driven by the tag handlers registered on xmlStream.
// Values used in this file: 'idle' | 'page' | 'id' | 'title' | 'revision' | 'text'.
let state = 'idle';

// Record being assembled for the page currently being parsed.
let doc = { id: -1, title: '', text: '' };
// For each tag we track: the state that must be active for its opening tag
// to be honoured. Opening a tracked tag in that state moves `state` to the
// tag's own name; every other tag/state combination is ignored.
const requiredStateForOpenTag = new Map([
  ['page', 'idle'],
  ['id', 'page'],
  ['title', 'page'],
  ['revision', 'page'],
  ['text', 'revision'],
]);

xmlStream.on('opentag', tagName => {
  const requiredState = requiredStateForOpenTag.get(tagName);
  if (requiredState === undefined) {
    // Not a tag this converter cares about.
    return;
  }
  if (state === requiredState) {
    state = tagName;
  }
});
// Store character data into the field of `doc` selected by the current state.
// Text seen in any other state is ignored.
xmlStream.on('text', text => {
  switch (state) {
    case 'title': {
      doc.title = text;
      break;
    }
    case 'id': {
      doc.id = Number(text);
      break;
    }
    case 'text': {
      try {
        doc.text = formatText(text);
      } catch (err) {
        // Clear the field so a failed conversion can never emit text that
        // was assigned to `doc.text` earlier.
        doc.text = '';
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`failed to format doc(id=${doc.id}, title=${doc.title}): ${reason}`);
      }
      break;
    }
  }
});
// Unwind the parser state when a tracked tag closes, and emit the finished
// record when its <page> closes.
xmlStream.on('closetag', name => {
  switch (name) {
    case 'page': {
      if (state === 'page') {
        jsonStream.write(doc);
        // Start a fresh record instead of reusing the written object:
        // otherwise fields not overwritten by the next page (e.g. `text`)
        // would leak into it, and an object still queued in the stream could
        // be mutated before it is serialized.
        doc = { id: -1, title: '', text: '' };
        state = 'idle';
      }
      break;
    }
    // Direct children of <page>: closing them returns to the 'page' state.
    case 'id':
    case 'title':
    case 'revision': {
      if (state === name) {
        state = 'page';
      }
      break;
    }
    // <text> is only tracked inside <revision>.
    case 'text': {
      if (state === 'text') {
        state = 'revision';
      }
      break;
    }
  }
});
// pipe() does not handle errors of the source stream; without a listener an
// unreadable or missing file surfaces as an unhandled 'error' event.
inputStream.on('error', err => {
  console.error(`failed to read ${filePath}: ${err.message}`);
  process.exitCode = 1;
});

// Dump file -> XML parser; serialized NDJSON records -> stdout.
inputStream.pipe(xmlStream);
jsonStream.pipe(process.stdout);
/**
 * Remove `{{...}}` template calls from a string.
 *
 * `{{llang|<code>|<content>}}` is replaced by `<content>`; every other
 * template is removed together with at most one following whitespace
 * character. Nested templates are handled by repeatedly removing the
 * innermost ones, so `{{a|{{b}}|c}}` disappears completely instead of
 * leaving `|c}}` behind.
 *
 * @param {string} markup - Text that may contain template calls.
 * @returns {string} The text with template calls stripped.
 */
function stripWikiTags(markup) {
  // Both patterns refuse to span another `{{`, so they only ever match a
  // template that has no template call left inside it.
  const llangTemplate = /{{llang\|\w+\|((?:(?!{{)[^}])*)}}/g;
  const innermostTemplate = /{{(?:(?!{{)[^}])*}}\s?/g;

  let result = markup;
  let previous;
  // Each pass peels one nesting level; every replacement shortens the string,
  // so the loop terminates once nothing matches any more.
  do {
    previous = result;
    result = result
      // strip llang tags, keeping their content
      .replace(llangTemplate, '$1')
      // strip all others
      .replace(innermostTemplate, '');
  } while (result !== previous);
  return result;
}
/**
 * Turn the raw content of a <text> element into plain text.
 *
 * Steps, in order: decode HTML entities, run the result through
 * `instaview.convert`, strip `{{...}}` templates with `stripWikiTags`, then
 * flatten with `html-to-text`.
 * NOTE(review): assumes instaview.convert returns HTML; confirm against the library.
 *
 * @param {string} text - Raw article markup as delivered by the XML parser.
 * @returns {string} Output of `htmlToText.convert` for the processed markup.
 */
function formatText(text) {
  const decodedMarkup = htmlEntities.decode(text);
  const converted = instaview.convert(decodedMarkup);
  const withoutTemplates = stripWikiTags(converted);

  // No line wrapping; link hrefs are ignored; images are skipped.
  const plainTextOptions = {
    wordwrap: false,
    selectors: [
      { selector: 'a', options: { ignoreHref: true } },
      { selector: 'img', format: 'skip' },
    ],
  };

  return htmlToText.convert(withoutTemplates, plainTextOptions);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment