nberlette · February 27, 2024 10:35
diff --git a/extract-workers-ai-models.ts b/extract-workers-ai-models.ts
 /*!
 * Run this script with Deno to extract all models from the Workers AI
 * documentation site (https://developers.cloudflare.com/workers-ai/models).
 *
 * @example
 * ```sh
 * # install Deno if you do not have it already
 * curl -fsSL https://deno.land/install.sh | sh -
 *
 * # run the script
 * deno run --allow-net ./extract-workers-ai-models.ts
 *
 * # example output:
 * # [
 * # {
 * #   name: "@hf/thebloke/llama-2-13b-chat-awq",
 * #   urls: [
 * #     [
 * #       "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ",
 * #       "More information"
 * #     ]
 * #   ],
 * #   info: "Llama 2 13B Chat AWQ is an efficient, accurate"... 117 more characters,
 * #   data: {
 * #     default_max_sequence_tokens_stream: 512,
 * #     default_max_sequence_tokens: 256
 * #   }
 * # },
 * # ...
 * # ]
 * ```
 */
 import {
  Document,
  DOMParser,
  Element,
 } from "https://deno.land/x/[email protected]/deno-dom-wasm.ts";

 /**
 * One model entry as produced by `extractModels`.
 */
 export interface Model {
  /** Text of the `<code>` element that identifies the model. */
  readonly name: string;
  /** Text of the table cell that follows the model name. */
  readonly info: string;
  /** Links found in that cell, as `[url, label]` pairs. */
  readonly urls: Array<readonly [url: string | URL, text: string]>;
  /** Numeric properties of the model, keyed by normalized label. */
  readonly data: { [key: string]: number };
 }

 /**
 * Fetches the HTML content for a given URL.
 *
 * Issues a GET request with a desktop-browser `User-Agent` header and follows
 * redirects.
 *
 * @param url The URL to fetch.
 * @param referrer The URL passed as the request's `referrer` option.
 * @returns The HTML content of the fetched URL.
 * @throws {Error} If the response status is not OK; the message contains the
 * URL and the numeric status code.
 */
 async function loadHTML(url: string, referrer: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    },
    method: "GET",
    mode: "cors",
    redirect: "follow",
    referrer,
  });
  if (!response.ok) {
    // Release the unread body so the failed response does not hold resources.
    await response.body?.cancel();
    // `statusText` can be empty (HTTP/2 has no reason phrase), so always
    // include the numeric status code.
    const status = `${response.status} ${response.statusText}`.trim();
    throw new Error(`Failed to fetch ${url}: ${status}`);
  }
  return await response.text();
 }

 // Shared parser instance used by every function in this module.
 const parser = new DOMParser();
 // Bound to `parser`: a method destructured off the instance would be called
 // without its `this` value.
 const parseDOM = parser.parseFromString.bind(parser);
 // Parses an HTML string into a document. The `type` parameter takes whatever
 // type `parseFromString` declares for its MIME-type argument.
 const parseHTML = (
  html: string,
  type: Parameters<typeof parseDOM>[1] = "text/html",
 ) => parseDOM(html, type)!;

 /**
 * Downloads a page and parses it into a DOM document.
 *
 * @param url The page to load.
 * @param referrer The referrer to send with the request; defaults to `url`.
 * @returns The document parsed from the response body as `text/html`.
 */
 async function loadDocument(url: string | URL, referrer = url): Promise<Document> {
  const target = url.toString();
  const origin = referrer.toString();
  const markup = await loadHTML(target, origin);
  const parsed = parser.parseFromString(markup, "text/html");
  return parsed!;
 }

 /**
 * Extracts model information from a given HTML document.
 *
 * Every element matched by `#main > article > table > tbody > tr > td > code`
 * is read as a model name. The next element sibling of its table cell supplies
 * the description text, the links, and the numeric properties.
 *
 * @param html The document HTML from which to extract models.
 * @returns The list of models extracted from the document, in document order.
 * Rows without a name or without a following cell are skipped, and HTML that
 * cannot be parsed yields an empty list.
 */
 function extractModels(html: string): Model[] {
  const document = parser.parseFromString(html, "text/html");
  if (!document) return [];

  // Captions of link icons that end up in the cell text (presumably
  // screen-reader-only labels; confirm against the live markup).
  const uselessRegExp = /(?:Open external link|External link icon)\n?/g;
  const usefulRegExp = /More information|Terms and license/;
  const urlRegExp = /https?:\/\/\S+/g;
  const moreInfoRegExp = /(More information)\s*/g;
  const dataRegExp = /<strong>(.+?)<\/strong>:\s*(\d+)/g;

  // "Default max tokens" -> "default_max_tokens"
  const normalizeKey = (label: string): string =>
    label.toLowerCase()
      .replace(/[^a-z0-9_]+/g, "_")
      .replace(/^_|_$|(?<=_)_+/g, "");

  const models: Model[] = [];
  const nameNodes = document.querySelectorAll(
    `#main > article > table > tbody > tr > td > code`,
  );

  for (const node of Array.from(nameNodes)) {
    const code = node as Element;
    const name = (code.textContent ?? "").trim();

    // The description cell is the next *element* after the name cell; text
    // nodes may sit between the two, so skip anything that is not an element.
    let sibling = code.parentNode?.nextSibling ?? null;
    while (sibling && sibling.nodeType !== 1) sibling = sibling.nextSibling;
    if (!name || !sibling) continue;
    const rhs = sibling as Element;

    const text = (rhs.textContent ?? "").replaceAll(uselessRegExp, "").trim();

    // Prefer real anchors; fall back to bare URLs found in the text. The two
    // sources have different shapes, so they are mapped separately.
    const anchors = Array.from(
      rhs.querySelectorAll("a[href]"),
      (a) => a as Element,
    );
    const urls: [string, string][] = anchors.length
      ? anchors.map((a): [string, string] => [
        a.getAttribute("href") ?? "",
        (a.textContent ?? "").match(usefulRegExp)?.[0] ?? "More information",
      ])
      : Array.from(
        text.matchAll(urlRegExp),
        (m): [string, string] => [m[0], "More information"],
      );

    // Turn the "More information" caption into a markdown link to the first
    // URL. A replacer function keeps `$` sequences in the URL literal.
    const href = urls[0]?.[0] ?? "#";
    const info = text.replaceAll(
      moreInfoRegExp,
      (_match: string, label: string) => `[${label}](${href})`,
    );

    // Collect `<strong>Label</strong>: 123` pairs from the cell markup.
    const data: Record<string, number> = {};
    for (const [, label, digits] of rhs.innerHTML.matchAll(dataRegExp)) {
      data[normalizeKey(label)] = parseInt(digits, 10);
    }

    models.push({ name, urls, info, data });
  }

  return models;
 }

 /**
 * Fetches and parses models from each category.
 *
 * Category pages are requested one at a time from
 * `https://developers.cloudflare.com/workers-ai/models/<slug>`. A category
 * that fails to load or parse is skipped with a warning, so the result is
 * best-effort.
 *
 * @returns The models of every category that could be processed, in category
 * order.
 */
 export async function listAllModels(): Promise<Model[]> {
  const categories: [slug: string, title: string][] = [
    ["text-generation", "Text Generation"],
    ["speech-recognition", "Automatic Speech Recognition"],
    ["translation", "Translation"],
    ["text-classification", "Text Classification"],
    ["image-classification", "Image Classification"],
    ["text-to-image", "Text-to-Image"],
    ["text-embeddings", "Text Embeddings"],
  ];
  const baseUrl = "https://developers.cloudflare.com/workers-ai/models";

  const models: Model[] = [];

  for (const [slug, title] of categories) {
    const url = `${baseUrl}/${slug}`;
    try {
      const html = await loadHTML(url, baseUrl);
      models.push(...extractModels(html));
    } catch (error) {
      // Best-effort: keep going, but report the failure instead of hiding it.
      console.warn(`Skipping "${title}" (${url}):`, error);
    }
  }

  // No logging here: the caller decides what to print, and the CLI entry
  // point already prints the returned list.
  return models;
 }

 // When this module is the program entry point, fetch every model and print
 // the resulting list.
 if (import.meta.main) {
  const models = await listAllModels();
  console.log(models);
 }
	/*!
	* Run this script with Deno to extract all models from the Workers AI
	* documentation site (https://developers.cloudflare.com/workers-ai/models).
	*
	* @example
	* ```sh
	* # install Deno if you do not have it already
	* curl -fsSL https://deno.land/install.sh | sh -
	*
	* # run the script
	* deno run --allow-net ./extract-workers-ai-models.ts
	*
	* # example output:
	* # [
	* # {
	* # name: "@hf/thebloke/llama-2-13b-chat-awq",
	* # urls: [
	* # [
	* # "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ",
	* # "More information"
	* # ]
	* # ],
	* # info: "Llama 2 13B Chat AWQ is an efficient, accurate"... 117 more characters,
	* # data: {
	* # default_max_sequence_tokens_stream: 512,
	* # default_max_sequence_tokens: 256
	* # }
	* # },
	* # ...
	* # ]
	* ```
	*/
	import {
	Document,
	DOMParser,
	Element,
	} from "https://deno.land/x/[email protected]/deno-dom-wasm.ts";

	/**
	 * One model entry as produced by `extractModels`.
	 */
	export interface Model {
	  /** Text of the `<code>` element that identifies the model. */
	  readonly name: string;
	  /** Text of the table cell that follows the model name. */
	  readonly info: string;
	  /** Links found in that cell, as `[url, label]` pairs. */
	  readonly urls: (readonly [url: string | URL, text: string])[];
	  /** Numeric properties of the model, keyed by normalized label. */
	  readonly data: Record<string, number>;
	}

	/**
	 * Fetches the HTML content for a given URL.
	 *
	 * Issues a GET request with a desktop-browser `User-Agent` header and follows
	 * redirects.
	 *
	 * @param url The URL to fetch.
	 * @param referrer The URL passed as the request's `referrer` option.
	 * @returns The HTML content of the fetched URL.
	 * @throws {Error} If the response status is not OK; the message contains the
	 * URL and the numeric status code.
	 */
	async function loadHTML(url: string, referrer: string): Promise<string> {
	  const response = await fetch(url, {
	    headers: {
	      "User-Agent":
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
	    },
	    method: "GET",
	    mode: "cors",
	    redirect: "follow",
	    referrer,
	  });
	  if (!response.ok) {
	    // Release the unread body so the failed response does not hold resources.
	    await response.body?.cancel();
	    // `statusText` can be empty (HTTP/2 has no reason phrase), so always
	    // include the numeric status code.
	    const status = `${response.status} ${response.statusText}`.trim();
	    throw new Error(`Failed to fetch ${url}: ${status}`);
	  }
	  return await response.text();
	}

	// Shared parser instance used by every function in this module.
	const parser = new DOMParser();
	// Bound to `parser`: a method destructured off the instance would be called
	// without its `this` value.
	const parseDOM = parser.parseFromString.bind(parser);
	// Parses an HTML string into a document. The `type` parameter takes whatever
	// type `parseFromString` declares for its MIME-type argument.
	const parseHTML = (
	  html: string,
	  type: Parameters<typeof parseDOM>[1] = "text/html",
	) => parseDOM(html, type)!;

	/**
	 * Downloads a page and parses it into a DOM document.
	 *
	 * @param url The page to load.
	 * @param referrer The referrer to send with the request; defaults to `url`.
	 * @returns The document parsed from the response body as `text/html`.
	 */
	async function loadDocument(url: string | URL, referrer = url): Promise<Document> {
	  const html = await loadHTML(url.toString(), referrer.toString());
	  return parser.parseFromString(html, "text/html")!;
	}

	/**
	 * Extracts model information from a given HTML document.
	 *
	 * Every element matched by `#main > article > table > tbody > tr > td > code`
	 * is read as a model name. The next element sibling of its table cell supplies
	 * the description text, the links, and the numeric properties.
	 *
	 * @param html The document HTML from which to extract models.
	 * @returns The list of models extracted from the document, in document order.
	 * Rows without a name or without a following cell are skipped, and HTML that
	 * cannot be parsed yields an empty list.
	 */
	function extractModels(html: string): Model[] {
	  const document = parser.parseFromString(html, "text/html");
	  if (!document) return [];

	  // Captions of link icons that end up in the cell text (presumably
	  // screen-reader-only labels; confirm against the live markup).
	  const uselessRegExp = /(?:Open external link|External link icon)\n?/g;
	  const usefulRegExp = /More information|Terms and license/;
	  const urlRegExp = /https?:\/\/\S+/g;
	  const moreInfoRegExp = /(More information)\s*/g;
	  const dataRegExp = /<strong>(.+?)<\/strong>:\s*(\d+)/g;

	  // "Default max tokens" -> "default_max_tokens"
	  const normalizeKey = (label: string): string =>
	    label.toLowerCase()
	      .replace(/[^a-z0-9_]+/g, "_")
	      .replace(/^_|_$|(?<=_)_+/g, "");

	  const models: Model[] = [];
	  const nameNodes = document.querySelectorAll(
	    `#main > article > table > tbody > tr > td > code`,
	  );

	  for (const node of Array.from(nameNodes)) {
	    const code = node as Element;
	    const name = (code.textContent ?? "").trim();

	    // The description cell is the next *element* after the name cell; text
	    // nodes may sit between the two, so skip anything that is not an element.
	    let sibling = code.parentNode?.nextSibling ?? null;
	    while (sibling && sibling.nodeType !== 1) sibling = sibling.nextSibling;
	    if (!name || !sibling) continue;
	    const rhs = sibling as Element;

	    const text = (rhs.textContent ?? "").replaceAll(uselessRegExp, "").trim();

	    // Prefer real anchors; fall back to bare URLs found in the text. The two
	    // sources have different shapes, so they are mapped separately.
	    const anchors = Array.from(
	      rhs.querySelectorAll("a[href]"),
	      (a) => a as Element,
	    );
	    const urls: [string, string][] = anchors.length
	      ? anchors.map((a): [string, string] => [
	        a.getAttribute("href") ?? "",
	        (a.textContent ?? "").match(usefulRegExp)?.[0] ?? "More information",
	      ])
	      : Array.from(
	        text.matchAll(urlRegExp),
	        (m): [string, string] => [m[0], "More information"],
	      );

	    // Turn the "More information" caption into a markdown link to the first
	    // URL. A replacer function keeps `$` sequences in the URL literal.
	    const href = urls[0]?.[0] ?? "#";
	    const info = text.replaceAll(
	      moreInfoRegExp,
	      (_match: string, label: string) => `[${label}](${href})`,
	    );

	    // Collect `<strong>Label</strong>: 123` pairs from the cell markup.
	    const data: Record<string, number> = {};
	    for (const [, label, digits] of rhs.innerHTML.matchAll(dataRegExp)) {
	      data[normalizeKey(label)] = parseInt(digits, 10);
	    }

	    models.push({ name, urls, info, data });
	  }

	  return models;
	}

	/**
	 * Fetches and parses models from each category.
	 *
	 * Category pages are requested one at a time from
	 * `https://developers.cloudflare.com/workers-ai/models/<slug>`. A category
	 * that fails to load or parse is skipped with a warning, so the result is
	 * best-effort.
	 *
	 * @returns The models of every category that could be processed, in category
	 * order.
	 */
	export async function listAllModels(): Promise<Model[]> {
	  const categories: [slug: string, title: string][] = [
	    ["text-generation", "Text Generation"],
	    ["speech-recognition", "Automatic Speech Recognition"],
	    ["translation", "Translation"],
	    ["text-classification", "Text Classification"],
	    ["image-classification", "Image Classification"],
	    ["text-to-image", "Text-to-Image"],
	    ["text-embeddings", "Text Embeddings"],
	  ];
	  const baseUrl = "https://developers.cloudflare.com/workers-ai/models";

	  const models: Model[] = [];

	  for (const [slug, title] of categories) {
	    const url = `${baseUrl}/${slug}`;
	    try {
	      const html = await loadHTML(url, baseUrl);
	      models.push(...extractModels(html));
	    } catch (error) {
	      // Best-effort: keep going, but report the failure instead of hiding it.
	      console.warn(`Skipping "${title}" (${url}):`, error);
	    }
	  }

	  // No logging here: the caller decides what to print, and the CLI entry
	  // point already prints the returned list.
	  return models;
	}

	// When this module is the program entry point, fetch every model and print
	// the resulting list.
	if (import.meta.main) {
	  const models = await listAllModels();
	  console.log(models);
	}