Created
January 19, 2020 17:15
-
-
Save milahu/592b49a7f38d49324219decbb93374fd to your computer and use it in GitHub Desktop.
show all additions in wikipedia page history
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// show all additions in wikipedia page history
//
// get the XML file from
// https://en.wikipedia.org/wiki/Special:Export
//
// Add pages manually:
// Skibadee
//
// [ ] Include only the current revision, not the full history
// ^ uncheck this
//
// set the filename in the variable
// wikipedia_xml_file
//
// install dependencies
// npm i xml2js diff
//
// run script
// node skibadee.js
//
// comment out the page-loop line marked
// "// flood limit"
// to see all pages
//
// license = CC0-1.0
const fs = require('fs')
const xml2js = require('xml2js')
const jsdiff = require('diff')

// edit this
const wikipedia_xml_file = 'Wikipedia-20200119163602.xml'

// Read the whole export into memory as a UTF-8 string.
const src = fs.readFileSync(wikipedia_xml_file, 'utf-8')

// Parse the export and, for every revision of every page, print the words
// that jsdiff reports as added relative to the previous non-empty revision
// of the same page.
xml2js.parseString(src, function (err, result) {
  // Report parse failures instead of crashing later on `result.mediawiki`.
  if (err) {
    console.error(`failed to parse ${wikipedia_xml_file}: ${err.message}`)
    process.exitCode = 1
    return
  }

  for (const [ip, page] of result.mediawiki.page.entries()) {
    // comment the next line to see all pages
    if (ip === 1) { break } // flood limit

    // Reset per page so the first revision of a page is never diffed
    // against the last revision of the previous page.
    let text_last = ''

    for (const [ir, revision] of page.revision.entries()) {
      //if (ir === 20) { break } // flood limit
      console.log(`# revision ${revision.id}`)

      // xml2js stores character data under `_` when the element has
      // attributes, and yields a bare string when it has none.
      const node = revision.text && revision.text[0]
      const text = typeof node === 'string' ? node : node && node._
      if (!text) { continue } // revision without text: nothing to diff

      // find changed words
      const diff = jsdiff.diffWords(text_last, text)
      for (const d of diff) {
        // print added words
        if (d.added) {
          console.log(d.value)
        }
      }
      text_last = text
    }
  }
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment