clhenrick · March 5, 2025 18:46
diff --git a/.nvmrc b/.nvmrc
 22.12.0
diff --git a/README.md b/README.md
diff --git a/index.js b/index.js
 #!/usr/bin/env zx

 import 'zx/globals';
 import * as cheerio from 'cheerio';
 import { chromium } from 'playwright';

 // URL of the WCAG quick reference page that fetchWcagQuickRefHtml() loads.
 // The query string carries the page's filter settings, so to scrape a
 // different selection: apply the desired filters on the site, then copy &
 // paste the resulting URL here.
 const url = "https://www.w3.org/WAI/WCAG22/quickref/?currentsidebar=%23col_customize&showtechniques=214%2C131&levels=aaa&technologies=smil%2Cpdf";

 /**
  * Opens the quick reference page (`url`) in a Playwright-launched Chromium
  * browser, waits for network activity to settle, and returns the rendered
  * document markup.
  *
  * @returns {Promise<string>} `outerHTML` of the page's root element.
  * @throws Propagates any launch, navigation or evaluation error; the browser
  *   is closed before the error reaches the caller.
  */
 async function fetchWcagQuickRefHtml() {
   console.log('scraping WCAG quick reference...');
   const browser = await chromium.launch();
   try {
     const page = await browser.newPage();
     await page.goto(url);
     // Wait for the network to go idle so the markup is read only after the
     // page has finished loading its resources.
     await page.waitForLoadState("networkidle");
     return await page.evaluate(() => document.documentElement.outerHTML);
   } finally {
     // Close the browser on failure as well as on success, so a navigation
     // error does not leave the browser open.
     await browser.close();
   }
 }

 /**
  * Pulls the success-criterion (SC) data out of the quick reference markup.
  *
  * Only `article.sc-wrapper.current` elements inside `<body>` are considered.
  *
  * @param {string} htmlStr - Full HTML of the quick reference page.
  * @returns {object} The object produced by cheerio's `extract()` with the
  *   keys `title`, `level`, `text` and `understanding`, plus a `list` array
  *   holding one entry per matched article.
  */
 function parseHtml(htmlStr) {
   console.log('parsing WCAG quick reference HTML...');
   const $ = cheerio.load(htmlStr);
   const $articles = $('body').find("article.sc-wrapper.current");
   console.log(`Number of WCAG SC: ${$articles.length}`);

   const extractionMap = {
     title: ['h4'],
     level: ['p.h4'],
     text: ['.sc-text > p:first-of-type'],
     understanding: [{
       selector: 'div.understanding > a',
       value: 'href'
     }]
   };
   const parsed = $articles.extract(extractionMap);

   // Not every SC article contains a list, so lists are collected article by
   // article: each entry is the inner HTML of the article's first
   // ".sc-text ul" match, or null when the article has none.
   parsed.list = $articles
     .toArray()
     .map((article) => $(article).find(".sc-text ul").html());

   return parsed;
 }

 /**
  * Renders the parsed success-criterion data as an HTML fragment.
  *
  * Each entry becomes an `<h3>` (linked when a link is available), a `<p>`
  * with the description, and a `<ul>` when the entry has list markup.
  *
  * `title`, `level`, `text` and `understanding` are escaped before being
  * placed in the markup (assumes they are plain text rather than HTML —
  * confirm against the extractor). `list` entries are inserted verbatim
  * because they are already HTML.
  *
  * @param {object} data
  * @param {string[]} data.title - One heading text per entry; drives the loop.
  * @param {string[]} [data.level] - Level text; "(Added in 2.x)" is stripped.
  * @param {string[]} [data.text] - Description text.
  * @param {string[]} [data.understanding] - Link target for each heading.
  * @param {(string|null)[]} [data.list] - Inner HTML of an entry's list, if any.
  * @returns {string} Concatenated markup, one line per element.
  */
 function formatData(data) {
   console.log('formatting parsed data...');
   const { title, level = [], text = [], understanding = [], list = [] } = data;
   const levelRegEx = /\(Added in 2\.\d\)/;
   // Presumably UI toggle text that leaks into the extracted paragraph.
   const descRegEx = /Show Hide full description/;
   const htmlEscapes = {
     '&': '&amp;',
     '<': '&lt;',
     '>': '&gt;',
     '"': '&quot;',
     "'": '&#39;',
   };
   const escapeHtml = (value) =>
     String(value).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

   return title
     .map((titleText, i) => {
       const link = understanding[i];
       // Missing entries fall back to '' instead of throwing on `.replace`;
       // trimming removes the whitespace left behind by the stripped phrases.
       const levelText = (level[i] ?? '').replace(levelRegEx, '').trim();
       const descText = (text[i] ?? '').replace(descRegEx, '').trim();

       const label = escapeHtml(
         levelText ? `${titleText} (${levelText})` : titleText,
       );
       const headingContent = link
         ? `<a href="${escapeHtml(link)}">${label}</a>`
         : label;

       const parts = [
         `<h3>${headingContent}</h3>`,
         `<p>${escapeHtml(descText)}</p>`,
       ];
       if (list[i]) {
         parts.push(`<ul>${list[i]}</ul>`);
       }
       return parts.map((part) => `${part}\n`).join('');
     })
     .join('');
 }

 /**
  * Wraps the formatted fragment in a minimal HTML document and writes it to
  * `index.html` in the current working directory.
  *
  * The document declares its character encoding and has a title: the file is
  * written as UTF-8, and without `<meta charset>` a browser opening it from
  * disk may guess a different encoding.
  *
  * `path` and `fs` are not imported by name in this file; presumably they are
  * the globals installed by `zx/globals` — confirm.
  *
  * @param {string} formattedText - HTML fragment placed inside `<body>`.
  * @returns {Promise<void>} Resolves once the (synchronous) write has finished.
  */
 async function writeHtmlFile(formattedText) {
   console.log('writing index.html file...');
   const contents = `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>WCAG Quick Reference</title></head><body>${formattedText}</body></html>`;
   const filePath = path.join(process.cwd(), 'index.html');
   fs.writeFileSync(filePath, contents, { encoding: 'utf8' });
 }

 /**
  * Runs the whole pipeline: scrape the page, parse it, format the result and
  * write the output file.
  *
  * Always ends the process: exit code 1 after logging any error raised by a
  * step, exit code 0 otherwise.
  */
 async function main() {
   try {
     const rawHtml = await fetchWcagQuickRefHtml();
     const markup = formatData(parseHtml(rawHtml));
     await writeHtmlFile(markup);
     console.log('done!');
   } catch (failure) {
     console.error(failure);
     process.exit(1);
   }
   process.exit(0);
 }

 // Script entry point: run the scrape -> parse -> format -> write pipeline.
 // main() terminates the process itself, so nothing runs after this line.
 await main();
diff --git a/package.json b/package.json
 {
  "name": "wcag-quick-ref-scrape",
  "version": "1.0.0",
  "main": "index.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "[email protected]",
  "license": "UNLICENSED",
  "description": "",
  "dependencies": {
    "cheerio": "^1.0.0",
    "playwright": "^1.50.1",
    "zx": "^8.4.0"
  }
 }
	#!/usr/bin/env zx

	import 'zx/globals';
	import * as cheerio from 'cheerio';
	import { chromium } from 'playwright';

	// URL of the WCAG quick reference page that fetchWcagQuickRefHtml() loads.
	// The query string carries the page's filter settings, so to scrape a
	// different selection: apply the desired filters on the site, then copy &
	// paste the resulting URL here.
	const url = "https://www.w3.org/WAI/WCAG22/quickref/?currentsidebar=%23col_customize&showtechniques=214%2C131&levels=aaa&technologies=smil%2Cpdf";

	/**
	 * Opens the quick reference page (`url`) in a Playwright-launched Chromium
	 * browser, waits for network activity to settle, and returns the rendered
	 * document markup.
	 *
	 * @returns {Promise<string>} `outerHTML` of the page's root element.
	 * @throws Propagates any launch, navigation or evaluation error; the browser
	 *   is closed before the error reaches the caller.
	 */
	async function fetchWcagQuickRefHtml() {
	  console.log('scraping WCAG quick reference...');
	  const browser = await chromium.launch();
	  try {
	    const page = await browser.newPage();
	    await page.goto(url);
	    // Wait for the network to go idle so the markup is read only after the
	    // page has finished loading its resources.
	    await page.waitForLoadState("networkidle");
	    return await page.evaluate(() => document.documentElement.outerHTML);
	  } finally {
	    // Close the browser on failure as well as on success, so a navigation
	    // error does not leave the browser open.
	    await browser.close();
	  }
	}

	/**
	 * Pulls the success-criterion (SC) data out of the quick reference markup.
	 *
	 * Only `article.sc-wrapper.current` elements inside `<body>` are considered.
	 *
	 * @param {string} htmlStr - Full HTML of the quick reference page.
	 * @returns {object} The object produced by cheerio's `extract()` with the
	 *   keys `title`, `level`, `text` and `understanding`, plus a `list` array
	 *   holding one entry per matched article.
	 */
	function parseHtml(htmlStr) {
	  console.log('parsing WCAG quick reference HTML...');
	  const $ = cheerio.load(htmlStr);
	  const $articles = $('body').find("article.sc-wrapper.current");
	  console.log(`Number of WCAG SC: ${$articles.length}`);

	  const extractionMap = {
	    title: ['h4'],
	    level: ['p.h4'],
	    text: ['.sc-text > p:first-of-type'],
	    understanding: [{
	      selector: 'div.understanding > a',
	      value: 'href'
	    }]
	  };
	  const parsed = $articles.extract(extractionMap);

	  // Not every SC article contains a list, so lists are collected article by
	  // article: each entry is the inner HTML of the article's first
	  // ".sc-text ul" match, or null when the article has none.
	  parsed.list = $articles
	    .toArray()
	    .map((article) => $(article).find(".sc-text ul").html());

	  return parsed;
	}

	/**
	 * Renders the parsed success-criterion data as an HTML fragment.
	 *
	 * Each entry becomes an `<h3>` (linked when a link is available), a `<p>`
	 * with the description, and a `<ul>` when the entry has list markup.
	 *
	 * `title`, `level`, `text` and `understanding` are escaped before being
	 * placed in the markup (assumes they are plain text rather than HTML —
	 * confirm against the extractor). `list` entries are inserted verbatim
	 * because they are already HTML.
	 *
	 * @param {object} data
	 * @param {string[]} data.title - One heading text per entry; drives the loop.
	 * @param {string[]} [data.level] - Level text; "(Added in 2.x)" is stripped.
	 * @param {string[]} [data.text] - Description text.
	 * @param {string[]} [data.understanding] - Link target for each heading.
	 * @param {(string|null)[]} [data.list] - Inner HTML of an entry's list, if any.
	 * @returns {string} Concatenated markup, one line per element.
	 */
	function formatData(data) {
	  console.log('formatting parsed data...');
	  const { title, level = [], text = [], understanding = [], list = [] } = data;
	  const levelRegEx = /\(Added in 2\.\d\)/;
	  // Presumably UI toggle text that leaks into the extracted paragraph.
	  const descRegEx = /Show Hide full description/;
	  const htmlEscapes = {
	    '&': '&amp;',
	    '<': '&lt;',
	    '>': '&gt;',
	    '"': '&quot;',
	    "'": '&#39;',
	  };
	  const escapeHtml = (value) =>
	    String(value).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);

	  return title
	    .map((titleText, i) => {
	      const link = understanding[i];
	      // Missing entries fall back to '' instead of throwing on `.replace`;
	      // trimming removes the whitespace left behind by the stripped phrases.
	      const levelText = (level[i] ?? '').replace(levelRegEx, '').trim();
	      const descText = (text[i] ?? '').replace(descRegEx, '').trim();

	      const label = escapeHtml(
	        levelText ? `${titleText} (${levelText})` : titleText,
	      );
	      const headingContent = link
	        ? `<a href="${escapeHtml(link)}">${label}</a>`
	        : label;

	      const parts = [
	        `<h3>${headingContent}</h3>`,
	        `<p>${escapeHtml(descText)}</p>`,
	      ];
	      if (list[i]) {
	        parts.push(`<ul>${list[i]}</ul>`);
	      }
	      return parts.map((part) => `${part}\n`).join('');
	    })
	    .join('');
	}

	/**
	 * Wraps the formatted fragment in a minimal HTML document and writes it to
	 * `index.html` in the current working directory.
	 *
	 * The document declares its character encoding and has a title: the file is
	 * written as UTF-8, and without `<meta charset>` a browser opening it from
	 * disk may guess a different encoding.
	 *
	 * `path` and `fs` are not imported by name in this file; presumably they are
	 * the globals installed by `zx/globals` — confirm.
	 *
	 * @param {string} formattedText - HTML fragment placed inside `<body>`.
	 * @returns {Promise<void>} Resolves once the (synchronous) write has finished.
	 */
	async function writeHtmlFile(formattedText) {
	  console.log('writing index.html file...');
	  const contents = `<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>WCAG Quick Reference</title></head><body>${formattedText}</body></html>`;
	  const filePath = path.join(process.cwd(), 'index.html');
	  fs.writeFileSync(filePath, contents, { encoding: 'utf8' });
	}

	/**
	 * Runs the whole pipeline: scrape the page, parse it, format the result and
	 * write the output file.
	 *
	 * Always ends the process: exit code 1 after logging any error raised by a
	 * step, exit code 0 otherwise.
	 */
	async function main() {
	  try {
	    const rawHtml = await fetchWcagQuickRefHtml();
	    const markup = formatData(parseHtml(rawHtml));
	    await writeHtmlFile(markup);
	    console.log('done!');
	  } catch (failure) {
	    console.error(failure);
	    process.exit(1);
	  }
	  process.exit(0);
	}

	// Script entry point: run the scrape -> parse -> format -> write pipeline.
	// main() terminates the process itself, so nothing runs after this line.
	await main();
	{
	"name": "wcag-quick-ref-scrape",
	"version": "1.0.0",
	"main": "index.js",
	"type": "module",
	"scripts": {
	"test": "echo \"Error: no test specified\" && exit 1"
	},
	"author": "[email protected]",
	"license": "UNLICENSED",
	"description": "",
	"dependencies": {
	"cheerio": "^1.0.0",
	"playwright": "^1.50.1",
	"zx": "^8.4.0"
	}
	}