#!/usr/bin/env zx

import 'zx/globals';
import * as cheerio from 'cheerio';
import { chromium } from 'playwright';
|
|
// URL of the WCAG quick reference page that gets scraped.
// Modify this URL as you see fit; e.g. apply the desired filters on the quick
// reference page, then copy & paste the resulting URL here.
const url = "https://www.w3.org/WAI/WCAG22/quickref/?currentsidebar=%23col_customize&showtechniques=214%2C131&levels=aaa&technologies=smil%2Cpdf";
|
|
|
/**
 * Opens `url` in a Playwright-launched Chromium instance and returns the
 * serialized markup of the loaded document.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a Chromium process running.
 *
 * @returns {Promise<string>} `document.documentElement.outerHTML` of the page.
 * @throws Propagates any error raised by Playwright while launching,
 *   navigating or evaluating.
 */
async function fetchWcagQuickRefHtml() {
  console.log('scraping WCAG quick reference...');

  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // NOTE(review): presumably the page applies the URL filters via script
    // after load, hence the wait for the network to go idle — confirm.
    await page.waitForLoadState("networkidle");

    // `return await` (not a bare `return`) so the evaluation settles before
    // the `finally` block closes the browser.
    return await page.evaluate(() => {
      return document.documentElement.outerHTML;
    });
  } finally {
    await browser.close();
  }
}
|
|
|
/**
 * Pulls the success-criterion data out of the quick reference markup.
 *
 * @param {string} htmlStr - HTML source of the quick reference page.
 * @returns {object} The result of cheerio's `extract` (keys `title`, `level`,
 *   `text`, `understanding`) with an additional `list` array holding one
 *   entry per matched article.
 */
function parseHtml(htmlStr) {
  console.log('parsing WCAG quick reference HTML...');

  const $ = cheerio.load(htmlStr);
  const $articles = $('body').find("article.sc-wrapper.current");
  console.log(`Number of WCAG SC: ${$articles.length}`);

  // NOTE(review): these selectors are collected across all matched articles,
  // so the resulting arrays only line up index-by-index if every article
  // yields exactly one match per selector — confirm against the live page.
  const fieldSelectors = {
    title: ['h4'],
    level: ['p.h4'],
    text: ['.sc-text > p:first-of-type'],
    understanding: [{
      'selector': 'div.understanding > a',
      value: 'href'
    }]
  };
  const scData = $articles.extract(fieldSelectors);

  // Not every SC article has a list, so lists are gathered per article to
  // keep one slot per article. Each slot is the inner HTML string of the
  // first matching <ul>, or null when the article has none. A plain array
  // map is used on purpose: cheerio's own `.map()` would drop null results.
  scData.list = $articles
    .toArray()
    .map((article) => $(article).find(".sc-text ul").html());

  return scData;
}
|
|
|
/**
 * Escapes the characters that are significant in HTML text content and in
 * double-quoted attribute values.
 *
 * @param {*} value - Value to escape; it is converted to a string first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value)
    .replaceAll('&', '&amp;')
    .replaceAll('<', '&lt;')
    .replaceAll('>', '&gt;')
    .replaceAll('"', '&quot;');
}

/**
 * Turns the parsed success-criterion data into an HTML fragment: one
 * `<h3>` heading (linked when a URL is available), one `<p>` description
 * and, when present, one `<ul>` per criterion.
 *
 * `title`, `level`, `text` and `understanding` entries are treated as plain
 * text and HTML-escaped; `list` entries are treated as ready-made inner HTML
 * and inserted verbatim.
 *
 * @param {object} data
 * @param {string[]} data.title - Criterion titles; drives the iteration.
 * @param {string[]} data.level - Conformance level text per criterion.
 * @param {string[]} data.text - Description text per criterion.
 * @param {string[]} data.understanding - Link target per criterion.
 * @param {(string|null)[]} [data.list] - Inner HTML of an optional list per
 *   criterion; falsy entries produce no `<ul>`.
 * @returns {string} The concatenated markup, each element followed by `\n`.
 */
function formatData(data) {
  console.log('formatting parsed data...');

  const { title, level, text, understanding, list = [] } = data;
  const levelRegEx = /\(Added in 2\.\d\)/;
  const descRegEx = /Show Hide full description/;

  const sections = title.map((rawTitle, i) => {
    // Missing entries become empty strings instead of throwing on `.replace`.
    const titleText = escapeHtml(String(rawTitle ?? '').trim());
    // Trim after stripping so no stray space is left inside "(Level …)".
    const levelText = escapeHtml(String(level[i] ?? '').replace(levelRegEx, '').trim());
    const descText = escapeHtml(String(text[i] ?? '').replace(descRegEx, '').trim());

    const link = understanding[i];
    const label = `${titleText} (${levelText})`;
    // Without a URL, emit a plain heading rather than href="undefined".
    const hasLink = link != null && link !== '';
    const headingMarkup = hasLink
      ? `<h3><a href="${escapeHtml(link)}">${label}</a></h3>`
      : `<h3>${label}</h3>`;
    const descMarkup = `<p>${descText}</p>`;

    let section = `${headingMarkup}\n${descMarkup}\n`;
    if (list[i]) {
      section += `<ul>${list[i]}</ul>\n`;
    }
    return section;
  });

  return sections.join('');
}
|
|
|
/**
 * Wraps the given markup in a minimal HTML document and writes it to
 * `index.html` in the current working directory.
 *
 * @param {string} formattedText - Markup placed inside `<body>`.
 * @returns {Promise<void>} Resolves once the synchronous write has returned.
 */
async function writeHtmlFile(formattedText) {
  console.log('writing index.html file...');

  const targetPath = path.join(process.cwd(), 'index.html');
  const documentMarkup = `<!DOCTYPE html><html lang="en"><body>${formattedText}</body></html>`;

  fs.writeFileSync(targetPath, documentMarkup, { encoding: 'utf8' });
}
|
|
|
/**
 * Runs the whole pipeline — scrape, parse, format, write — and then ends the
 * process: exit code 0 on success, 1 (after logging the error) on failure.
 */
async function main() {
  let exitCode = 0;

  try {
    const html = await fetchWcagQuickRefHtml();
    const parsed = parseHtml(html);
    const markup = formatData(parsed);
    await writeHtmlFile(markup);
    console.log('done!');
  } catch (error) {
    console.error(error);
    exitCode = 1;
  }

  // Single exit point; the process is always terminated explicitly.
  process.exit(exitCode);
}
|
|
|
// Script entry point: run the pipeline via top-level await. main() handles
// its own errors and terminates the process itself.
await main();