Last active
February 27, 2024 10:35
-
-
Save nberlette/83bdb713660586a3fb2ce8a3e50e20fa to your computer and use it in GitHub Desktop.
WorkersAI: utility to extract LLM model data from Cloudflare Workers AI docs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*!
 * Run this script with Deno to extract all models from the Workers AI
 * documentation site (https://developers.cloudflare.com/workers-ai/models).
 *
 * @example
 * ```sh
 * # install Deno if you do not have it already
 * curl -fsSL https://deno.land/install.sh | sh -
 *
 * # run the script
 * deno run --allow-net ./extract-workers-ai-models.ts
 *
 * # example output:
 * # [
 * #   {
 * #     name: "@hf/thebloke/llama-2-13b-chat-awq",
 * #     urls: [
 * #       [
 * #         "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ",
 * #         "More information"
 * #       ]
 * #     ],
 * #     info: "Llama 2 13B Chat AWQ is an efficient, accurate"... 117 more characters,
 * #     data: {
 * #       default_max_sequence_tokens_stream: 512,
 * #       default_max_sequence_tokens: 256
 * #     }
 * #   },
 * #   ...
 * # ]
 * ```
 */
import { | |
Document, | |
DOMParser, | |
Element, | |
} from "https://deno.land/x/[email protected]/deno-dom-wasm.ts"; | |
/**
 * One model entry as scraped from a Workers AI models documentation page.
 */
export interface Model {
  /** Model identifier, e.g. `@hf/thebloke/llama-2-13b-chat-awq`. */
  readonly name: string;
  /** Links attached to the model, as `[url, link text]` pairs. */
  readonly urls: Array<readonly [url: string | URL, text: string]>;
  /** Description text taken from the model's table row. */
  readonly info: string;
  /** Numeric properties keyed by name, e.g. `default_max_sequence_tokens`. */
  readonly data: { [key: string]: number };
}
/**
 * Fetches the HTML content for a given URL.
 *
 * The request is a redirect-following `GET` sent with a desktop-browser
 * `User-Agent` header.
 *
 * @param url The URL to fetch.
 * @param referrer The referrer passed along in the fetch options.
 * @returns The HTML content of the fetched URL.
 * @throws {Error} If the response status is not successful (`response.ok` is
 * false). The message contains the URL and the numeric HTTP status.
 */
async function loadHTML(url: string, referrer: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    },
    method: "GET",
    mode: "cors",
    redirect: "follow",
    referrer,
  });
  if (!response.ok) {
    // Discard the unread body so the response stream is released right away
    // instead of lingering until garbage collection.
    await response.body?.cancel();
    // `statusText` may be an empty string (HTTP/2 has no reason phrase), so
    // the numeric status is always included to keep the message meaningful.
    const status = `${response.status} ${response.statusText}`.trim();
    throw new Error(`Failed to fetch ${url}: ${status}`);
  }
  return await response.text();
}
// Single DOMParser instance; `loadDocument` and `extractModels` use it too.
const parser = new DOMParser();

// Bound to `parser` so the method keeps its receiver when invoked as a free
// function. Pulling it off the instance by destructuring would call it with
// `this === undefined`, which only works while the implementation never
// reads `this`.
const parseDOM = parser.parseFromString.bind(parser);

/**
 * Parses an HTML string into a document.
 *
 * @param html The markup to parse.
 * @param type The MIME type handed to the parser; defaults to `"text/html"`.
 * @returns The parsed document (the parser's nullable result is asserted
 * non-null).
 */
const parseHTML = (
  html: string,
  type: Parameters<typeof parseDOM>[1] = "text/html",
) => parseDOM(html, type)!;
/**
 * Downloads a page and parses it into a DOM document.
 *
 * @param url Address of the page to download.
 * @param referrer Referrer forwarded to `loadHTML`; defaults to `url`.
 * @returns The document parsed from the downloaded HTML.
 */
async function loadDocument(url: string | URL, referrer = url): Promise<Document> {
  // Both arguments may be URL objects; `loadHTML` only accepts strings.
  const target = String(url);
  const origin = String(referrer);
  const markup = await loadHTML(target, origin);
  const parsed = parser.parseFromString(markup, "text/html");
  return parsed!;
}
/**
 * Extracts model information from a given HTML document.
 *
 * Every `<code>` element matched by
 * `#main > article > table > tbody > tr > td > code` is treated as a model
 * name. The next `<td>` sibling of the cell holding that `<code>` supplies
 * the rest of the entry:
 *
 * - `info`: the cell text with "Open external link" / "External link icon"
 *   markers removed and each "More information" turned into a markdown link
 *   to the first URL (or `#` when there is none).
 * - `urls`: one `[href, label]` pair per `<a href>` in the cell; when the
 *   cell has no anchors, any plain `http(s)://` URLs found in its text.
 * - `data`: every `<strong>Label</strong>: 123` pair in the cell HTML, keyed
 *   by the label lower-cased and reduced to `snake_case`.
 *
 * `<code>` elements with no text are skipped. A model whose row has no
 * description cell is still returned, with empty `info`, `urls` and `data`.
 *
 * @param html The document HTML from which to extract models.
 * @returns The list of models extracted from the document, in page order.
 */
function extractModels(html: string): Model[] {
  const doc = parser.parseFromString(html, "text/html");
  if (!doc) return [];

  // Patterns are created once per call rather than once per table row.
  const noiseRegExp = /(?:Open external link|External link icon)\n?/g;
  const labelRegExp = /More information|Terms and license/;
  const urlRegExp = /https?:\/\/\S+/g;
  const moreInfoRegExp = /(More information)\s*/g;
  const dataRegExp = /<strong>(.+?)<\/strong>:\s*(\d+)/g;
  const nonKeyCharsRegExp = /[^a-z0-9_]+/g;
  const repeatedUnderscoreRegExp = /_+/g;
  const edgeUnderscoreRegExp = /^_|_$/g;

  const models: Model[] = [];
  const codes = doc.querySelectorAll(
    `#main > article > table > tbody > tr > td > code`,
  );

  for (const node of Array.from(codes)) {
    const code = node as Element;
    // `textContent` is available on every DOM node, unlike `innerText`.
    const name = (code.textContent ?? "").trim();
    if (!name) continue;

    // The description lives in the next <td>; step over whitespace text
    // nodes (and anything else) that sit between the two cells.
    let sibling = code.parentNode?.nextSibling ?? null;
    while (sibling && sibling.nodeName.toUpperCase() !== "TD") {
      sibling = sibling.nextSibling;
    }
    const cell = sibling as Element | null;

    const text = (cell?.textContent ?? "").replace(noiseRegExp, "").trim();

    const urls: [string, string][] = [];
    const anchors = cell ? Array.from(cell.querySelectorAll("a[href]")) : [];
    for (const anchorNode of anchors) {
      const anchor = anchorNode as Element;
      const href = anchor.getAttribute("href");
      if (!href) continue;
      // Non-global match so the label is exactly the phrase, with no
      // trailing newline picked up from the link text.
      const label = (anchor.textContent ?? "").match(labelRegExp)?.[0] ??
        "More information";
      urls.push([href, label]);
    }
    if (urls.length === 0) {
      // No anchors: fall back to bare URLs written out in the text.
      for (const [url] of text.matchAll(urlRegExp)) {
        urls.push([url, "More information"]);
      }
    }

    // A replacer function is used so `$` sequences inside the URL are not
    // interpreted as replacement patterns.
    const primaryUrl = urls[0]?.[0] ?? "#";
    const info = text.replace(
      moreInfoRegExp,
      (_match, label: string) => `[${label}](${primaryUrl})`,
    );

    const data: Record<string, number> = {};
    for (const [, label, value] of (cell?.innerHTML ?? "").matchAll(dataRegExp)) {
      const key = label
        .toLowerCase()
        .replace(nonKeyCharsRegExp, "_")
        .replace(repeatedUnderscoreRegExp, "_")
        .replace(edgeUnderscoreRegExp, "");
      data[key] = parseInt(value, 10);
    }

    models.push({ name, urls, info, data });
  }

  return models;
}
/**
 * Fetches and parses models from each category.
 *
 * All category pages under the Workers AI models documentation are requested
 * concurrently, and the extracted models are concatenated in the order the
 * categories are listed below. The operation is best-effort: a category that
 * fails to download or parse is reported with `console.warn` and contributes
 * no models, instead of failing the whole call.
 *
 * The function does not print the result itself; callers decide what to do
 * with the returned list.
 *
 * @returns Every model that could be extracted, grouped by category order.
 */
export async function listAllModels(): Promise<Model[]> {
  const categories: readonly (readonly [slug: string, label: string])[] = [
    ["text-generation", "Text Generation"],
    ["speech-recognition", "Automatic Speech Recognition"],
    ["translation", "Translation"],
    ["text-classification", "Text Classification"],
    ["image-classification", "Image Classification"],
    ["text-to-image", "Text-to-Image"],
    ["text-embeddings", "Text Embeddings"],
  ];
  const baseUrl = "https://developers.cloudflare.com/workers-ai/models";

  // The pages are independent of each other, so they are loaded in parallel.
  // `Promise.all` keeps the results in category order regardless of which
  // request finishes first.
  const perCategory = await Promise.all(
    categories.map(async ([slug, label]): Promise<Model[]> => {
      const url = `${baseUrl}/${slug}`;
      try {
        return extractModels(await loadHTML(url, baseUrl));
      } catch (error: unknown) {
        // Keep going with the remaining categories, but leave a trace of
        // what was skipped and why.
        console.warn(`Skipping "${label}" models (${url}):`, error);
        return [];
      }
    }),
  );

  return perCategory.flat();
}
// Entry point: when `import.meta.main` is set, collect every model and
// print the resulting list.
if (import.meta.main) {
  const models = await listAllModels();
  console.log(models);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment