MDN static scan
const os = require("os");
const fs = require("fs");
const { sep } = require("path");
const sqlite3 = require("sqlite3");

const dbName = "mdn_stats.db";
const dbPath = os.tmpdir() + sep + "scanMDNstats" + sep + dbName;
const cachePath = os.tmpdir() + sep + "scanMDNstats";
const recapFileName = "recap.json";
const macroList = ["Compat", "Languages", "EmbedInteractiveExample"];

/**
 * Get a given file from the cache
 * @async
 * @param {string} filename the name of the file to get
 * @returns {Promise<string>} a promise which resolves with the content
 * of the file
 */
const getFileCache = function(filename) {
  return new Promise((resolve, reject) => {
    fs.readFile(cachePath + sep + filename, "utf8", (err, data) => {
      if (err) {
        reject(err);
      } else {
        resolve(data);
      }
    });
  });
};
// Insert a page (root or localized) into the database
const createDBPage = function(page, parentPageURL) {
  const fieldStrings = macroList.map(macroName => "has_" + macroName.toLowerCase()).join(", ");
  const fieldAliases = macroList.map(macroName => "$has_" + macroName.toLowerCase()).join(", ");
  const fieldContent = {
    $last_edit: page.last_edit,
    $last_real_edit: page.lastEditTime,
    $url: page.url,
    $locale: page.locale,
    $parent_page: parentPageURL,
    $nb_rev: page.nbRevision,
    $has_sidebar: page.hasSidebar
  };
  macroList.forEach(macroName => fieldContent["$has_" + macroName.toLowerCase()] = (page["has" + macroName] ? 1 : 0));
  const stmt = db.prepare("INSERT INTO page (last_edit, last_real_edit, url, locale, " + fieldStrings + ", has_sidebar, parent_page, nb_rev) VALUES ($last_edit, $last_real_edit, $url, $locale, " + fieldAliases + ", $has_sidebar, $parent_page, $nb_rev)", fieldContent);
  stmt.run();
  stmt.finalize();
};
const db = new sqlite3.Database(dbPath);
const tableDefsMacro = macroList.map(macroName => "has_" + macroName.toLowerCase() + " INTEGER NOT NULL").join(", ");
db.run("CREATE TABLE page ( last_edit TEXT NOT NULL, last_real_edit TEXT NOT NULL, url TEXT NOT NULL, locale TEXT NOT NULL, " + tableDefsMacro + ", has_sidebar TEXT, parent_page TEXT, nb_rev INTEGER NOT NULL, PRIMARY KEY(url))", [], () => undefined);
getFileCache(recapFileName).then(content => {
  const allPages = JSON.parse(content);
  allPages.forEach(page => createDBPage(page, null));
  allPages.forEach(page => {
    page.locales.forEach(pageL => createDBPage(pageL, page.url));
  });
});
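Once the database has been populated, it can be queried with the same sqlite3 module. A minimal sketch (not part of the gist; it only assumes the page table created above):

const os = require("os");
const { sep } = require("path");
const sqlite3 = require("sqlite3");

const db = new sqlite3.Database(os.tmpdir() + sep + "scanMDNstats" + sep + "mdn_stats.db");
// Count, per locale, the pages that do not use the Compat macro.
db.all(
  "SELECT locale, COUNT(*) AS missing_compat FROM page WHERE has_compat = 0 GROUP BY locale",
  (err, rows) => {
    if (err) throw err;
    rows.forEach(row => console.log(row.locale, row.missing_compat));
  }
);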
The second file of the gist is the scanner itself: it crawls the selected MDN sections and writes the recap.json file consumed by the loader above.
const os = require("os");
const fs = require("fs");
const { sep } = require("path");

const waitingTimeMS = 120;
const waitingTimeMSHistory = 3000;
const localeEn = "en-US";
const sections = [
  {name: "Learn", prefix: "docs", sidebarMacros: ["LearnSidebar"]},
  {name: "Mozilla", prefix: "docs", sidebarMacros: []},
  {name: "HTML", prefix: "docs/Web", sidebarMacros: ["HTMLRef", "HTMLSidebar"]},
  {name: "CSS", prefix: "docs/Web", sidebarMacros: ["CSSRef"]},
  {name: "JavaScript", prefix: "docs/Web", sidebarMacros: ["JSRef", "JSSidebar"]}
];
const coreUsersList = [
  {name: "fscholz", locales: ["en-US"]},
  {name: "wbamberg", locales: ["en-US"]},
  {name: "chrisdavidmills", locales: ["en-US"]},
  {name: "Sheppy", locales: ["en-US"]},
  {name: "jswisher", locales: ["en-US"]},
  {name: "Jeremie", locales: ["en-US"]},
  {name: "teoli", locales: ["en-US"]},
  {name: "jwhitlock", locales: ["en-US"]},
  {name: "SphinxKnight", locales: ["en-US", "fr"]}
];
const baseURL = "https://developer.mozilla.org";
const detailSuffix = "$children?expand";
const historySuffix = "$history?limit=all";
const rawSuffix = "?raw";
const macroList = ["Compat", "Languages", "EmbedInteractiveExample"];
const cachePath = os.tmpdir() + sep + "scanMDNstats";
const recapPath = os.tmpdir() + sep + "scanMDNstats" + sep + "recap.json";

/**
 * Get the content of a given page through cache or web.
 * Uses a Promise.
 * @async
 * @param {string} slug The slug "/<locale>/docs/X/Y" pointing to the page
 * @returns {Promise<string>} a Promise which resolves into the content of the page
 */
const getPageContent = function(slug) {
  const shortSlug = slug.startsWith("/") ? slug.substring(1) : slug;
  const url = baseURL + "/" + shortSlug + rawSuffix;
  return getInfoFromCacheOrWeb(url);
};

/**
 * Build the directory which will be used for the cache
 * @async
 * @returns {Promise} a Promise which resolves in undefined if the directory
 * has been created
 */
const initCache = function() {
  return new Promise((resolve, reject) => {
    if (fs.existsSync(cachePath)) {
      resolve();
    } else {
      fs.mkdir(cachePath, err => {
        // Only reject on an actual error; resolve once the directory exists
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      });
    }
  });
};
/**
 * Get a given file from the cache
 * @async
 * @param {string} filename the name of the file to get
 * @returns {Promise<string>} a promise which resolves with the content
 * of the file
 */
const getFileCache = function(filename) {
  return new Promise((resolve, reject) => {
    fs.readFile(cachePath + sep + filename, "utf8", (err, data) => {
      if (err) {
        reject(err);
      } else {
        resolve(data);
      }
    });
  });
};

/**
 * Write a content file in the cache
 * @async
 * @param {string} filename the name for this file (e.g. "foo.txt")
 * @param {string} content the data to write into the file
 * @returns {Promise<string>} a promise which resolves with content when the file has
 * been created.
 */
const writeFileCache = function(filename, content) {
  return new Promise((resolve, reject) => {
    fs.writeFile(cachePath + sep + filename, content, (err) => {
      // Resolve only once the write has actually completed
      if (err) {
        reject(err);
      } else {
        resolve(content);
      }
    });
  });
};
/**
 * Get the JSON for the page containing the metadata for all the subpages of a section.
 * E.g. https://developer.mozilla.org/en-US/docs/Web/HTML$children?expand
 * @async
 * @param {object} section the section descriptor (e.g. the "HTML" entry of sections)
 * @returns {Promise<string>} a Promise which resolves with the JSON content (whether from the cache or the web)
 */
const getRootPageInfo = function(section) {
  const rootSectionURL = baseURL + "/" + localeEn + "/" + section.prefix + "/" + section.name + detailSuffix;
  return getInfoFromCacheOrWeb(rootSectionURL);
};
/**
 * Get the content of a web page from the web (basically a promisified GET)
 * @async
 * @param {string} url the URL for the page we want to get (e.g. https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/Array$json)
 * @returns {Promise<string>} a Promise which resolves into the content (string) of the page.
 */
const getContentPromise = function(url) {
  return new Promise((resolve, reject) => {
    const lib = url.startsWith("https") ? require("https") : require("http");
    console.log("Getting " + url + " from Web");
    const request = lib.get(url, (response) => {
      if (response.statusCode !== 200) {
        const error = new Error("Request Failed.\n" +
          `Status Code: ${response.statusCode}`);
        console.error(error);
        response.resume();
        reject(error);
        return; // do not attach data handlers on a failed request
      }
      response.setEncoding("utf8");
      const body = [];
      response.on("data", (chunk) => body.push(chunk));
      response.on("end", () => resolve(body.join("")));
    });
    request.on("error", (err) => reject(err));
  });
};
/**
 * From a JSON describing a section, build a flat array of all the subpages
 * @param {object} jsonContent the parsed $children?expand JSON
 * @returns {Array} an array of all of the subpages
 */
const extractSubpages = function(jsonContent) {
  const acc = [];
  jsonContent.subpages.forEach(el => {
    const infos = {url: el.url, last_edit: el.last_edit, locale: el.locale};
    infos.locales = [];
    for (const localizedPage of el.translations) {
      const localeObj = {
        locale: localizedPage.locale,
        url: localizedPage.url,
        last_edit: localizedPage.last_edit
      };
      infos.locales.push(localeObj);
    }
    acc.push(infos);
    // Recurse only when there actually are subpages
    if (el.subpages && el.subpages.length > 0) {
      acc.push(...extractSubpages(el));
    }
  });
  return acc;
};
/**
 * Wait for some time (mainly to avoid being blocked due to rate-limiting)
 * @async
 * @param {number} ms the number of milliseconds to wait
 * @returns {Promise}
 */
function sleep(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms);
  });
}

/**
 * Test if the content string uses a given macro
 * @param {string} content
 * @param {string} macroName
 * @returns {boolean} true if the macro is used, false if not.
 */
function hasMacro(content, macroName) {
  const re = new RegExp("\\{\\{[ ]?" + macroName + "[ ]?\\(", "ig");
  return content.search(re) !== -1;
}
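// Note (illustrative example, not part of the gist): the regular expression above looks
// for the macro name followed by an opening parenthesis, so
// hasMacro('{{Compat("api.Foo")}}', "Compat") returns true, while a call without
// arguments such as "{{Compat}}" is not matched.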
/**
 * Count the number of revisions from the source of a $history view
 * @param {string} content the content of the $history view
 * @returns {number} the number of revisions
 */
const parseCountRevision = function(content) {
  // match() returns null when there is no occurrence, hence the fallback to an empty array
  return (content.match(/revision-list-comment/gi) || []).length;
};
/**
 * Test if the content string uses macros from a list
 * @param {string} content
 * @param {Array} macroList
 * @returns {object} an object where keys are macros' names and values are true or false
 */
function testMacros(content, macroList) {
  const recapObj = {};
  macroList.forEach(macroName => {recapObj[macroName] = hasMacro(content, macroName);});
  return recapObj;
}
/**
 * Test if the content string uses any macro of a given list
 * @param {string} content
 * @param {Array} macroList
 * @returns {boolean} true if one of the macros is used, false otherwise
 */
function hasAnyMacro(content, macroList) {
  return macroList.some(macroName => hasMacro(content, macroName));
}
/**
 * Fetches the last datetime of an edit coming from a user who is not in the core
 * contributor list
 * @param {string} locale
 * @param {string} content the content of an $history?limit=all view (HTML)
 * @param {Array<object>} coreUsersList an array of users each having locales where
 * not to "count" them
 * @returns {string} the datetime attribute of the matching revision
 */
function getLastEditTime(locale, content, coreUsersList) {
  const jsdom = require("jsdom");
  const { JSDOM } = jsdom;
  const dom = new JSDOM(content);
  const matches = dom.window.document.querySelectorAll("ul.revision-list > li");
  let datetime;
  for (const match of matches) {
    const author = match.querySelector(".revision-list-creator > a").textContent;
    datetime = match.querySelector(".revision-list-date time").getAttribute("datetime");
    if (!coreUsersList.some(user => (user.name === author && (!user.locales.includes(locale))))) {
      return datetime;
    }
  }
  return datetime;
}
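// Illustrative reading of the loop above (example not part of the gist): a revision made
// by an author absent from coreUsersList is returned immediately, while a revision by
// "fscholz" (listed for "en-US") on a page whose locale is "fr" is skipped and the loop
// moves on to the next, older revision.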
/**
 * Translate a slug into a string which can be used as a filename
 * @param {string} url
 * @returns {string} a string which can be used as a filename (at least on Win)
 */
function encodeURLFileName(url) {
  return encodeURIComponent(url.replace(baseURL, "").replace(/\*/g, "star"));
}
/**
 * Fetch the content of a given webpage from the local filesystem used as cache,
 * or from the web server
 * @async
 * @param {string} url the url of the page we want the content from
 * @returns {Promise<string>} the content of the web page.
 */
const getInfoFromCacheOrWeb = function(url) {
  // Convert URL into filename
  const fileNameURL = encodeURLFileName(url);
  if (fileNameURL.length >= 255) {
    return Promise.reject(new Error("name too long"));
  }
  return initCache()
    .then(() => getFileCache(fileNameURL)) // Available from cache, nice
    .then((content) => content,
      () => { // Not available from cache, getting from the Web then
        console.log("Cache unavailable - Fetching from the Web: " + url);
        return getContentPromise(url).then(
          (content) => writeFileCache(fileNameURL, content)
        );
      }
    );
};
/**
 * Write the content of the "report" (result of the process)
 * to the filesystem
 * @async
 * @param {string} content
 * @returns {Promise<string>} The content of the report (text)
 */
const writeResult = function(content) {
  return new Promise((resolve, reject) => {
    fs.writeFile(recapPath, content, (err) => {
      // Resolve only once the file has actually been written
      if (err) {
        reject(err);
      } else {
        resolve(content);
      }
    });
  });
};
/**
 * Wait a bit to comply with rate-limit (if necessary) then test if a page has some macros
 * @async
 * @param {object} page Page object with a URL property pointing to the content
 * @param {Array} macroList a list of macros to test if they are in the page
 * @param {object} section the section descriptor, used for its sidebarMacros
 * @returns {Promise<object>} a promise which resolves into an object having a boolean property for each macro
 */
const processPage = function(page, macroList, section) {
  // Fallback object, shaped like the result of testMacros, used when the page cannot be fetched
  const defaultObj = {};
  macroList.forEach(macroName => {
    defaultObj[macroName] = false;
  });
  defaultObj.hasSidebar = false;
  return getFileCache(encodeURLFileName(page.url + rawSuffix)).then(
    () => sleep(1), // we avoid being rate-limited if the content is already in cache
    () => sleep(waitingTimeMS)
  ).then(() => getPageContent(page.url).then(
    content => {
      const macrosObj = testMacros(content, macroList);
      macrosObj.hasSidebar = hasAnyMacro(content, section.sidebarMacros);
      return macrosObj;
    },
    () => defaultObj));
};
/**
 * Wait a bit to comply with rate-limit (if necessary) then count the number of revisions of
 * a given page and extract its last "real" edit time
 * @async
 * @param {object} page Page object with a URL property pointing to the content
 * @returns {Promise<object>} a promise which resolves into an object with the number of
 * revisions (nbRevision) and the last edit time (lastEditTime) of this page
 */
const processHistory = function(page) {
  // Check the cache for the history view itself to decide how long to wait
  return getFileCache(encodeURLFileName(page.url + historySuffix + rawSuffix)).then(
    () => sleep(1), // we avoid being rate-limited if the content is already in cache
    () => sleep(waitingTimeMSHistory)
  ).then(() => getPageContent(page.url + historySuffix)
    .then(
      content => { return { nbRevision: parseCountRevision(content), lastEditTime: getLastEditTime(page.locale, content, coreUsersList) }; },
      () => { return { nbRevision: 0, lastEditTime: ((new Date()).toISOString()) }; }
    )
  );
};
/**
 * Basically "what we do for a given section".
 * This function builds a promise that analyzes each page and localized page
 * under a given section. We augment the existing JSON by adding properties
 * describing the usage of some macros and the number of revisions.
 * @param {object} section the section descriptor (e.g. the "HTML" entry of sections)
 * @returns {Promise<Array>} a promise which resolves into an array of objects following the $children?expand view with additional properties
 */
function buildSectionPromise(section) {
  return getRootPageInfo(section) // Getting the $children?expand view to have info over root, subpages and translations edit time
    .then(async (jsonContent) => {
      const content = JSON.parse(jsonContent);
      const subpages = extractSubpages(content);
      for (const page of subpages) {
        ({nbRevision: page.nbRevision, lastEditTime: page.lastEditTime} = await processHistory(page));
        const macroResult = await processPage(page, macroList, section);
        macroList.forEach(macroName => {
          page["has" + macroName] = macroResult[macroName];
        });
        page.hasSidebar = macroResult.hasSidebar;
        for (const locale of page.locales) {
          ({nbRevision: locale.nbRevision, lastEditTime: locale.lastEditTime} = await processHistory(locale));
          const macroLocaleResult = await processPage(locale, macroList, section);
          macroList.forEach(macroName => {
            locale["has" + macroName] = macroLocaleResult[macroName];
          });
          locale.hasSidebar = macroLocaleResult.hasSidebar;
        }
      }
      return subpages;
    })
    .catch(err => { console.error(err); return []; });
}
/**
 * This function applies buildSectionPromise sequentially (we are rate-limited
 * anyway, so we await each section) and collects the results of each section
 * @async
 * @param {Array} sections
 * @returns {Array} an array of objects following the $children?expand view with additional properties
 */
async function processSections(sections) {
  let subpages = [];
  for (const section of sections) {
    subpages = subpages.concat(await buildSectionPromise(section));
  }
  return subpages;
}

processSections(sections).then(subpages => { writeResult(JSON.stringify(subpages.flat())); });
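For reference, each entry written to recap.json ends up with roughly the following shape (an illustrative example with invented values; the field names come from extractSubpages, processHistory and processPage above):

// {
//   url: "/en-US/docs/Web/CSS/color",   // hypothetical page
//   last_edit: "…",
//   locale: "en-US",
//   nbRevision: 42,
//   lastEditTime: "…",
//   hasCompat: true,
//   hasLanguages: false,
//   hasEmbedInteractiveExample: true,
//   hasSidebar: true,
//   locales: [
//     { locale: "fr", url: "/fr/docs/Web/CSS/color", last_edit: "…",
//       nbRevision: 7, lastEditTime: "…", hasCompat: true, hasLanguages: false,
//       hasEmbedInteractiveExample: true, hasSidebar: true }
//   ]
// }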