Skip to content

Instantly share code, notes, and snippets.

@SphinxKnight
Last active February 18, 2019 07:13
Show Gist options
  • Save SphinxKnight/f3b232450fea15cf0118ed2a207b6060 to your computer and use it in GitHub Desktop.
Save SphinxKnight/f3b232450fea15cf0118ed2a207b6060 to your computer and use it in GitHub Desktop.
MDN static scan
const os = require("os");
const fs = require("fs");
const { sep } = require("path");
const sqlite3 = require("sqlite3");
// Directory (inside the OS temporary directory) holding the scan cache and the database
const cachePath = `${os.tmpdir()}${sep}scanMDNstats`;
// File name and full path of the SQLite database
const dbName = "mdn_stats.db";
const dbPath = `${cachePath}${sep}${dbName}`;
// Name of the JSON report (read from the cache directory) which is loaded into the database
const recapFileName = "recap.json";
// Macros for which a has_<macro> column is stored with every page
const macroList = [
    "Compat",
    "Languages",
    "EmbedInteractiveExample"
];
/**
 * Read a file stored in the cache directory.
 * @async
 * @param {string} filename name of the file, relative to the cache directory
 * @returns {Promise<string>} a promise fulfilled with the UTF-8 content of the
 * file, or rejected with the error reported by fs.readFile
 */
const getFileCache = function(filename) {
    return new Promise((onContent, onFailure) => {
        const fullPath = cachePath + sep + filename;
        fs.readFile(fullPath, "utf8", (readError, text) => {
            if (readError) {
                onFailure(readError);
                return;
            }
            onContent(text);
        });
    });
};
/**
 * Insert one page (from the main list or one of its localized versions) as a
 * row of the "page" table.
 * @param {object} page the page entry; the fields last_edit, lastEditTime, url,
 * locale, nbRevision, hasSidebar and has<Macro> are read from it
 * @param {?string} parentPageURL value stored in the parent_page column
 */
const createDBPage = function(page, parentPageURL){
    // One has_<macro> column and one $has_<macro> placeholder per tracked macro
    const macroColumns = macroList.map(macroName => "has_" + macroName.toLowerCase());
    const columnList = macroColumns.join(", ");
    const placeholderList = macroColumns.map(column => "$" + column).join(", ");

    const boundValues = {
        $last_edit: page.last_edit,
        $last_real_edit: page.lastEditTime,
        $url: page.url,
        $locale: page.locale,
        $parent_page: parentPageURL,
        $nb_rev: page.nbRevision,
        $has_sidebar: page.hasSidebar
    };
    // Macro flags are stored as 1 / 0
    for (const macroName of macroList) {
        boundValues["$has_" + macroName.toLowerCase()] = page["has" + macroName] ? 1 : 0;
    }

    const sql = "INSERT INTO page (last_edit, last_real_edit, url, locale," + columnList + ", has_sidebar, parent_page, nb_rev) VALUES ($last_edit, $last_real_edit, $url, $locale," + placeholderList + ", $has_sidebar, $parent_page, $nb_rev)";
    const insertStatement = db.prepare(sql, boundValues);
    insertStatement.run();
    insertStatement.finalize();
};
// Open the SQLite database file which receives the content of the report
const db = new sqlite3.Database(dbPath);
// One "has_<macro> INTEGER NOT NULL" column definition per tracked macro
const tableDefsMacro = macroList.map(macroName => "has_" + macroName.toLowerCase() + " INTEGER NOT NULL").join(", ");
// IF NOT EXISTS lets the script run again on an existing database; any other
// failure is reported instead of being silently discarded.
db.run("CREATE TABLE IF NOT EXISTS page ( last_edit TEXT NOT NULL, last_real_edit TEXT NOT NULL, url TEXT NOT NULL, locale TEXT NOT NULL," + tableDefsMacro + ", has_sidebar TEXT, parent_page TEXT, nb_rev INTEGER NOT NULL, PRIMARY KEY(url))", [], (err) => {
    if (err) {
        console.error(err);
    }
});
// Load the report from the cache directory and insert every page in the
// database: first the pages of the main list (without parent), then each of
// their localized versions (with the main page URL as parent).
getFileCache(recapFileName).then(content => {
    const allPages = JSON.parse(content);
    allPages.forEach(page => createDBPage(page, null));
    allPages.forEach(page => {
        page.locales.forEach(pageL => createDBPage(pageL, page.url));
    });
}).catch(err => {
    // Unreadable report, invalid JSON or malformed entry: report the failure
    // instead of leaving the rejection unhandled
    console.error(err);
    process.exitCode = 1;
});
const os = require("os");
const fs = require("fs");
const { sep } = require("path");
// Delay (ms) applied before getting the source of a page when the cache probe fails
const waitingTimeMS = 120;
// Delay (ms) applied before getting a $history view when the cache probe fails
const waitingTimeMSHistory = 3000;
// Locale used to build the URL of the root page of each section
const localeEn = "en-US";
// Sections to scan: name and URL prefix of the root page, and the macros
// whose presence marks a page as having a sidebar
const sections = [
    {
        name: "Learn",
        prefix: "docs",
        sidebarMacros: ["LearnSidebar"]
    },
    {
        name: "Mozilla",
        prefix: "docs",
        sidebarMacros: []
    },
    {
        name: "HTML",
        prefix: "docs/Web",
        sidebarMacros: ["HTMLRef", "HTMLSidebar"]
    },
    {
        name: "CSS",
        prefix: "docs/Web",
        sidebarMacros: ["CSSRef"]
    },
    {
        name: "JavaScript",
        prefix: "docs/Web",
        sidebarMacros: ["JSRef", "JSSidebar"]
    }
];
// Users given to getLastEditTime: a revision authored by one of them is
// skipped on pages whose locale is not listed in their `locales`
const coreUsersList = [
    { name: "fscholz", locales: ["en-US"] },
    { name: "wbamberg", locales: ["en-US"] },
    { name: "chrisdavidmills", locales: ["en-US"] },
    { name: "Sheppy", locales: ["en-US"] },
    { name: "jswisher", locales: ["en-US"] },
    { name: "Jeremie", locales: ["en-US"] },
    { name: "teoli", locales: ["en-US"] },
    { name: "jwhitlock", locales: ["en-US"] },
    { name: "SphinxKnight", locales: ["en-US", "fr"] }
];
// Origin of the site and suffixes appended to page URLs to select a view
const baseURL = "https://developer.mozilla.org";
const detailSuffix = "$children?expand";
const historySuffix = "$history?limit=all";
const rawSuffix = "?raw";
// Macros whose usage is recorded for every page
const macroList = [
    "Compat",
    "Languages",
    "EmbedInteractiveExample"
];
// Cache directory (inside the OS temporary directory) and path of the final report
const cachePath = `${os.tmpdir()}${sep}scanMDNstats`;
const recapPath = `${cachePath}${sep}recap.json`;
/**
 * Get the raw source of a page, from the cache when available and from the
 * web otherwise (delegates to getInfoFromCacheOrWeb).
 * @async
 * @param {string} slug The slug "/<locale>/docs/X/Y" pointing to the page
 * @returns {Promise<string>} a Promise which resolves into the content of the page
 */
const getPageContent = function(slug) {
    // Drop one leading "/" so the URL does not get "//" right after the host
    let relativePath = slug;
    if (slug.startsWith("/")) {
        relativePath = slug.substring(1);
    }
    return getInfoFromCacheOrWeb(baseURL + "/" + relativePath + rawSuffix);
};
/**
 * Build the directory which will be used for the cache
 * @async
 * @returns {Promise} a Promise which resolves with undefined once the
 * directory exists (already there or just created) and rejects with the
 * fs.mkdir error otherwise
 */
const initCache = function() {
    return new Promise((resolve, reject) => {
        // `recursive: true` makes mkdir succeed when the directory already
        // exists, so no separate (synchronous) existence check is needed.
        fs.mkdir(cachePath, { recursive: true }, err => {
            if (err) {
                reject(err);
            } else {
                // The previous code called reject() here too, so a freshly
                // created cache directory was reported as a failure.
                resolve();
            }
        });
    });
};
/**
 * Read a file stored in the cache directory.
 * @async
 * @param {string} filename name of the file, relative to the cache directory
 * @returns {Promise<string>} a promise fulfilled with the UTF-8 content of the
 * file, or rejected with the error reported by fs.readFile
 */
const getFileCache = function(filename) {
    return new Promise((onContent, onFailure) => {
        const fullPath = cachePath + sep + filename;
        fs.readFile(fullPath, "utf8", (readError, text) => {
            if (readError) {
                onFailure(readError);
                return;
            }
            onContent(text);
        });
    });
};
/**
 * Write a content file in the cache
 * @async
 * @param {string} filename the name for this file (e.g. "foo.txt")
 * @param {string} content the data to write into the file
 * @returns {Promise<string>} a promise which resolves with content once the
 * file has been written, and rejects with the fs.writeFile error otherwise
 */
const writeFileCache = function(filename, content){
    return new Promise((resolve, reject) => {
        fs.writeFile(cachePath + sep + filename, content, (err) => {
            // Settle only when the write has completed: resolving right after
            // scheduling the write (as before) hid every write error.
            if (err) {
                reject(err);
            } else {
                resolve(content);
            }
        });
    });
};
/**
 * Get the JSON text of the "$children?expand" view of the root page of a section.
 * E.g. https://developer.mozilla.org/en-US/docs/Web/HTML$children?expand
 * @async
 * @param {object} section a section description; its `prefix` (e.g. "docs/Web")
 * and `name` (e.g. "HTML") are used to build the URL
 * @returns {Promise<string>} the value returned by getInfoFromCacheOrWeb for this URL
 */
const getRootPageInfo = function(section) {
    const { prefix, name } = section;
    return getInfoFromCacheOrWeb(`${baseURL}/${localeEn}/${prefix}/${name}${detailSuffix}`);
};
/**
 * Get the content of a web page from the web (basically a promisified-GET)
 * @async
 * @param {string} url the URL for the page we want to get (e.g. https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/Array$json)
 * @returns {Promise<string>} a Promise which resolves into the content (string)
 * of the page. It rejects when the status code is not 200 or when the request
 * or the response stream fails.
 */
const getContentPromise = function(url) {
    return new Promise((resolve, reject) => {
        const lib = url.startsWith("https") ? require("https") : require("http");
        console.log("Getting " + url + " from Web");
        const request = lib.get(url, (response) => {
            if (response.statusCode !== 200) {
                const error = new Error("Request Failed.\n" +
                    `Status Code: ${response.statusCode}`);
                console.error(error);
                // Consume the body so that the socket is released
                response.resume();
                reject(error);
                // Stop here: the body of a failed response is not collected
                return;
            }
            response.setEncoding("utf8");
            const body = [];
            response.on("data", (chunk) => body.push(chunk));
            // Without this handler a failure while reading the body would
            // leave the promise pending forever
            response.on("error", (err) => reject(err));
            response.on("end", () => resolve(body.join("")));
        });
        request.on("error", (err) => reject(err));
    });
};
/**
 * From the parsed JSON describing a section, list all the subpages (at any
 * depth) in a flat array. Each page comes before its own subpages.
 * @param {object} jsonContent a node of the tree; its `subpages` array (if any)
 * is walked recursively
 * @returns {Array} an array of {url, last_edit, locale, locales} objects where
 * `locales` lists {locale, url, last_edit} for each translation
 */
const extractSubpages = function(jsonContent) {
    const acc = [];
    // A node without children may have no `subpages` array at all. The former
    // `el.subpages !== []` guard was always true and did not protect against it.
    const children = Array.isArray(jsonContent.subpages) ? jsonContent.subpages : [];
    children.forEach(el => {
        const translations = Array.isArray(el.translations) ? el.translations : [];
        acc.push({
            url: el.url,
            last_edit: el.last_edit,
            locale: el.locale,
            locales: translations.map(localizedPage => ({
                locale: localizedPage.locale,
                url: localizedPage.url,
                last_edit: localizedPage.last_edit
            }))
        });
        // Appended one by one (no spread) so that a very large subtree cannot
        // exceed the maximum number of call arguments
        for (const descendant of extractSubpages(el)) {
            acc.push(descendant);
        }
    });
    return acc;
};
/**
 * Build a promise fulfilled after a delay (mainly used to avoid being blocked
 * due to rate-limiting)
 * @async
 * @param {number} ms the number of milliseconds to wait
 * @returns {Promise} a promise fulfilled (with undefined) when the timer fires
 */
function sleep(ms) {
    const startTimer = (wakeUp) => {
        setTimeout(wakeUp, ms);
    };
    return new Promise(startTimer);
}
/**
 * Test if the content string uses a given macro.
 * A usage is "{{", optional whitespace, the macro name (case-insensitive),
 * optional whitespace, then either "(" (call with an argument list) or "}}"
 * (call without parentheses, e.g. "{{CSSRef}}").
 * @param {string} content
 * @param {string} macroName
 * @returns {boolean} true if the macro is used, false if not.
 */
function hasMacro(content, macroName) {
    // The name is matched literally: escape characters which are special in a regexp
    const escapedName = macroName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // No "g" flag: only the existence of a match matters
    const re = new RegExp("\\{\\{\\s*" + escapedName + "\\s*(?:\\(|\\}\\})", "i");
    return re.test(content);
}
/**
 * Count the number of revisions from the source of a $history view, i.e. the
 * number of (case-insensitive) occurrences of "revision-list-comment".
 * @param {string} content the content of the $history view
 * @returns {number} the number of revisions (0 when there is no occurrence)
 */
const parseCountRevision = function(content) {
    // String.prototype.match returns null (not an empty array) without match
    const markers = content.match(/revision-list-comment/gi);
    return markers === null ? 0 : markers.length;
};
/**
 * Check, for every macro of a list, whether the content string uses it
 * (each check is done by hasMacro).
 * @param {string} content
 * @param {Array} macroList
 * @returns {object} an object where keys are macros' names and values are true or false
 */
function testMacros(content, macroList) {
    return macroList.reduce((usage, macroName) => {
        usage[macroName] = hasMacro(content, macroName);
        return usage;
    }, {});
}
/**
 * Check whether the content string uses at least one macro of a list
 * (each check is done by hasMacro and the search stops at the first hit).
 * @param {string} content
 * @param {Array} macroList
 * @returns {boolean} true if one of the macro is used, false otherwise
 */
function hasAnyMacro(content, macroList) {
    for (const macroName of macroList) {
        if (hasMacro(content, macroName)) {
            return true;
        }
    }
    return false;
}
/**
* Fetches the last datetime of edit coming from a users who is not in the core contributor
* list
* @param {string} locale
* @param {string} content the content of an $history?limit=all view (HTML)
* @param {Array<object>} coreUsersList an array of users each having locales where
* not to "count" them
*/
function getLastEditTime(locale, content, coreUsersList){
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const dom = new JSDOM(content);
const matches = dom.window.document.querySelectorAll("ul.revision-list > li");
let datetime;
for(const match of matches){
const author = match.querySelector(".revision-list-creator > a").textContent ;
datetime = match.querySelector(".revision-list-date time").getAttribute("datetime");
if(!coreUsersList.some(user => (user.name === author && (!user.locales.includes(locale))))){
return datetime;
}
}
return datetime;
}
/**
 * Translate a URL or slug into a string which can be used as a filename:
 * the first occurrence of the site origin (baseURL) is removed, every "*" is
 * replaced with "star" (encodeURIComponent leaves "*" untouched) and the
 * result is percent-encoded.
 * @param {string} url
 * @returns {string} a string which can be used as a filename (at least on Win)
 */
function encodeURLFileName(url){
    // A regexp with the "g" flag replaces every "*"; a string pattern only
    // replaced the first one.
    return encodeURIComponent(url.replace(baseURL, "").replace(/\*/g, "star"));
}
/**
 * Get the content of a web page, from the cache directory when a cached copy
 * can be read, from the web server otherwise (the downloaded content is then
 * written to the cache).
 * @async
 * @param {string} url the url of the page we want the content from
 * @returns {Promise<string>} the content of the web page. The promise is
 * rejected with Error("name too long") when the cache file name derived from
 * the URL has 255 characters or more.
 */
const getInfoFromCacheOrWeb = function (url){
    // Name of the cache file matching this URL
    const cacheFileName = encodeURLFileName(url);
    if(cacheFileName.length >= 255){
        return Promise.reject(new Error("name too long"));
    }
    // Fallback used when the cache directory or the cached file is unavailable
    const downloadAndStore = () => {
        console.log("Cache unavailable - Fetching from the Web :" + url);
        return getContentPromise(url).then(
            (content) => writeFileCache(cacheFileName, content)
        );
    };
    return initCache()
        .then(() => getFileCache(cacheFileName))
        .catch(downloadAndStore);
};
/**
 * Write the content of the "report" (result of the process)
 * to the filesystem (at recapPath)
 * @async
 * @param {string} content
 * @returns {Promise<string>} a promise which resolves with the content of the
 * report (text) once the file has been written, and rejects with the
 * fs.writeFile error otherwise
 */
const writeResult = function(content){
    return new Promise((resolve, reject) => {
        fs.writeFile(recapPath, content, (err) => {
            // Settle only when the write has completed: resolving right after
            // scheduling the write (as before) hid every write error.
            if (err) {
                reject(err);
            } else {
                resolve(content);
            }
        });
    });
};
/**
 * Wait a bit to comply with rate-limit (if necessary) then tests if a page has some macros
 * @async
 * @param {object} page Page object with a URL property pointing to the content
 * @param {Array} macroList a list of macros to test if they are in the page
 * @param {object} section the section of the page; its `sidebarMacros` list is
 * used to compute `hasSidebar`
 * @returns {Promise<object>} a promise which resolves into an object having a
 * boolean property for each macro (keyed by the macro name) and a boolean
 * `hasSidebar` property. Every value is false when the content cannot be fetched.
 */
const processPage = function(page, macroList, section){
    // Fallback result. It is keyed by macro name, like the object built by
    // testMacros: the former "has" + name keys did not match it.
    const defaultObj = { hasSidebar: false };
    macroList.forEach(macroName => {
        defaultObj[macroName] = false;
    });
    // Reading the cached file is only a probe: the wait is long only when the
    // content is going to be requested from the web
    return getFileCache(encodeURLFileName(page.url + rawSuffix)).then(
        () => sleep(1), // we avoid being ratelimited if content already in cache
        () => sleep(waitingTimeMS)
    ).then(() => getPageContent(page.url).then(
        content => {
            const macrosObj = testMacros(content, macroList);
            macrosObj.hasSidebar = hasAnyMacro(content, section.sidebarMacros);
            return macrosObj;
        },
        () => defaultObj));
};
/**
 * Wait a bit to comply with rate-limit (if necessary) then count the number of revisions of
 * a given page and find its last edit time
 * @async
 * @param {object} page Page object with a URL property pointing to the content
 * and a locale property
 * @returns {Promise<object>} a promise which resolves into
 * {nbRevision, lastEditTime}; when the history cannot be fetched, nbRevision
 * is 0 and lastEditTime is the current time (ISO string)
 */
const processHistory = function(page){
    const historySlug = page.url + historySuffix;
    // Probe the cache entry of the history view itself (the raw page used to
    // be probed instead): the long wait is only needed when the history is
    // going to be requested from the web.
    // NOTE(review): getPageContent appends rawSuffix, so the requested URL ends
    // with "$history?limit=all?raw" (two "?") - confirm this is intended.
    return getFileCache(encodeURLFileName(historySlug + rawSuffix)).then(
        () => sleep(1), // we avoid being ratelimited if content already in cache
        () => sleep(waitingTimeMSHistory)
    ).then(() => getPageContent(historySlug)
        .then(
            content => ({
                nbRevision: parseCountRevision(content),
                lastEditTime: getLastEditTime(page.locale, content, coreUsersList)
            }),
            () => ({nbRevision: 0, lastEditTime: (new Date()).toISOString()})
        )
    );
};
/**
 * Basically "what do we do for a given section"
 * This function builds a promise that analyzes each page and localized pages
 * under a given section. We augment the existing entries by adding properties
 * to know more about some macros usage, the number of revisions and the last
 * edit time.
 * @param {object} section The description of the section of MDN (ex. the HTML section)
 * @returns {Promise<array>} a promise which resolves in array of objects
 * following the $children?expand view with additional properties
 * (nbRevision, lastEditTime, has<Macro>, hasSidebar). When the section cannot
 * be processed the error is logged and the promise resolves with an empty array.
 */
function buildSectionPromise(section){
    // Adds nbRevision, lastEditTime, has<Macro> and hasSidebar to one entry
    // (a page or one of its localized versions)
    const annotate = async (entry) => {
        ({nbRevision: entry.nbRevision, lastEditTime: entry.lastEditTime} = await processHistory(entry));
        const macroResult = await processPage(entry, macroList, section);
        macroList.forEach(macroName => {
            entry["has" + macroName] = macroResult[macroName];
        });
        entry.hasSidebar = macroResult.hasSidebar;
    };
    return getRootPageInfo(section) // Getting the $children?expand view to have info over root, subpages and translations edit time
        .then(async (jsonContent) => {
            const subpages = extractSubpages(JSON.parse(jsonContent));
            // Entries are processed one after the other on purpose (rate limit)
            for (const page of subpages) {
                await annotate(page);
                for (const locale of page.locales) {
                    await annotate(locale);
                }
            }
            return subpages;
        })
        .catch(err => {
            console.error(err);
            // An empty list, not "": the result is concatenated with the
            // pages of the other sections
            return [];
        });
}
/**
 * Run buildSectionPromise on each section, one section after the other
 * (we're ratelimited anyway so each result is awaited before starting the
 * next section), and concatenate the results in a single array.
 * @async
 * @param {Array} sections
 * @returns {Array} an array of objects following the $children?expand view with additional properties
 */
async function processSections(sections){
    const perSection = [];
    for (const section of sections){
        perSection.push(await buildSectionPromise(section));
    }
    // Same as chaining concat() section after section
    return [].concat(...perSection);
}
// Entry point: scan every section, then write the collected pages as JSON in
// the report file. The promise of writeResult is returned so that a failed
// write reaches the catch handler instead of being left floating.
processSections(sections)
    .then(subpages => writeResult(JSON.stringify(subpages.flat())))
    .catch(err => {
        console.error(err);
        process.exitCode = 1;
    });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment