Skip to content

Instantly share code, notes, and snippets.

@clhenrick
Created March 5, 2025 18:46
Show Gist options
  • Save clhenrick/e0d8687f359b551d1b9b6ff9c0fdadbc to your computer and use it in GitHub Desktop.
Save clhenrick/e0d8687f359b551d1b9b6ff9c0fdadbc to your computer and use it in GitHub Desktop.
WCAG Quick Reference Scraper

WCAG Quick Reference Web Scrape

A Node.js script that scrapes the WCAG Quick Reference guide with filters applied. Success Criteria that match the filters are scraped and reformatted into HTML that can be copied and pasted into Confluence.

Install

Ensure that the correct version of Node.js is being used:

nvm use

Install dependencies via NPM:

npm install

Run Script

node index.js

or

npx zx index.js

Watch Mode

Note: Requires Node.js v22 or greater.

node --watch index.js
#!/usr/bin/env zx
import 'zx/globals';
import * as cheerio from 'cheerio';
import { chromium } from 'playwright';
// URL of the WCAG quick reference page to scrape. Modify it as you see fit:
// e.g. apply the desired filters on the quick reference site, then copy &
// paste the resulting URL (including its query string) here.
const url = "https://www.w3.org/WAI/WCAG22/quickref/?currentsidebar=%23col_customize&showtechniques=214%2C131&levels=aaa&technologies=smil%2Cpdf";
/**
 * Open the module-level `url` in Chromium (via Playwright), wait until the
 * network is idle, and return the rendered document's HTML.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a Chromium process running.
 *
 * @returns {Promise<string>} `outerHTML` of the page's root element.
 * @throws Propagates any error raised by Playwright while launching,
 *   navigating, or evaluating.
 */
async function fetchWcagQuickRefHtml() {
  console.log('scraping WCAG quick reference...');
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // The quick reference applies its filters client-side, so wait for the
    // page to settle before serializing the DOM.
    await page.waitForLoadState("networkidle");
    return await page.evaluate(() => document.documentElement.outerHTML);
  } finally {
    await browser.close();
  }
}
/**
 * Parse the quick reference HTML and collect data for every Success Criterion
 * article matching `article.sc-wrapper.current`.
 *
 * @param {string} htmlStr - HTML of the quick reference page.
 * @returns {object} The object produced by cheerio's `extract` (array fields
 *   `title`, `level`, `text`, `understanding`) with an added `list` array
 *   holding one entry per matched article.
 */
function parseHtml(htmlStr) {
  console.log('parsing WCAG quick reference HTML...');
  const $ = cheerio.load(htmlStr);
  const $scArticles = $('body').find("article.sc-wrapper.current");
  console.log(`Number of WCAG SC: ${$scArticles.length}`);

  const fieldMap = {
    title: ['h4'],
    level: ['p.h4'],
    text: ['.sc-text > p:first-of-type'],
    understanding: [{ selector: 'div.understanding > a', value: 'href' }],
  };
  const scData = $scArticles.extract(fieldMap);

  // Not every SC article has a list, so lists are gathered article by article
  // to keep one slot per article: the list's inner HTML string, or null when
  // the article has no list.
  scData.list = $scArticles
    .toArray()
    .map((article) => $(article).find(".sc-text ul").html());

  return scData;
}
/**
 * Escape the characters that are significant in HTML text and in
 * double-quoted attribute values.
 *
 * @param {*} value - Value to escape; coerced to a string.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/**
 * Turn the parsed Success Criteria data into an HTML fragment: one linked
 * `<h3>` heading and one `<p>` description per criterion, followed by a
 * `<ul>` when the criterion has a list.
 *
 * `title`, `level`, `text` and `understanding` are treated as plain text and
 * are HTML-escaped before interpolation; `list` entries are expected to
 * already be HTML markup and are inserted as-is.
 *
 * @param {object} data - Parallel arrays indexed by criterion.
 * @param {string[]} data.title - Criterion titles.
 * @param {string[]} data.level - Conformance level text; an "(Added in 2.x)"
 *   note is removed.
 * @param {string[]} data.text - Description text; the "Show Hide full
 *   description" toggle label is removed.
 * @param {string[]} data.understanding - Link targets for the headings.
 * @param {(string|null)[]} data.list - Inner HTML of each criterion's list,
 *   or a falsy value when there is none.
 * @returns {string} The concatenated HTML fragment.
 */
function formatData(data) {
  console.log('formatting parsed data...');
  const { title, level, text, understanding, list } = data;
  const levelRegEx = /\(Added in 2\.\d\)/;
  const descRegEx = /Show Hide full description/;

  return title
    .map((titleText, i) => {
      // Fall back to an empty string so one missing field does not abort the
      // whole run with a TypeError.
      const link = escapeHtml(understanding[i] ?? '');
      const levelText = escapeHtml((level[i] ?? '').replace(levelRegEx, ''));
      const descText = escapeHtml((text[i] ?? '').replace(descRegEx, ''));

      const headingMarkup = `<h3><a href="${link}">${escapeHtml(titleText)} (${levelText})</a></h3>`;
      const descMarkup = `<p>${descText}</p>`;
      const listMarkup = list[i] ? `<ul>${list[i]}</ul>\n` : '';

      return `${headingMarkup}\n${descMarkup}\n${listMarkup}`;
    })
    .join('');
}
/**
 * Wrap the formatted fragment in a minimal HTML document and write it to
 * `index.html` in the current working directory.
 *
 * @param {string} formattedText - HTML fragment to place inside `<body>`.
 * @returns {Promise<void>} Resolves once the file has been written (the write
 *   itself is synchronous).
 */
async function writeHtmlFile(formattedText) {
  console.log('writing index.html file...');
  const targetPath = path.join(process.cwd(), 'index.html');
  const documentMarkup = `<!DOCTYPE html><html lang="en"><body>${formattedText}</body></html>`;
  fs.writeFileSync(targetPath, documentMarkup, { encoding: 'utf8' });
}
/**
 * Run the whole pipeline: fetch the page, parse it, format the result and
 * write the output file. Terminates the process with exit code 0 on success,
 * or logs the error and terminates with exit code 1 on failure.
 */
async function main() {
  let exitCode = 0;
  try {
    const html = await fetchWcagQuickRefHtml();
    const parsed = parseHtml(html);
    await writeHtmlFile(formatData(parsed));
    console.log('done!');
  } catch (err) {
    console.error(err);
    exitCode = 1;
  }
  process.exit(exitCode);
}
// Script entry point: run the pipeline and wait for it to finish.
await main();
{
"name": "wcag-quick-ref-scrape",
"version": "1.0.0",
"main": "index.js",
"type": "module",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "[email protected]",
"license": "UNLICENSED",
"description": "",
"dependencies": {
"cheerio": "^1.0.0",
"playwright": "^1.50.1",
"zx": "^8.4.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment