Last active
June 3, 2023 14:50
-
-
Save dcdunkan/38725722806aca8012af485dfa283845 to your computer and use it in GitHub Desktop.
Script for validating links in grammyjs/website before building the website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { extname, join } from "https://deno.land/[email protected]/path/mod.ts"; | |
import { | |
DOMParser, | |
HTMLDocument, | |
} from "https://deno.land/x/[email protected]/deno-dom-wasm.ts"; | |
import MarkdownIt from "https://esm.sh/[email protected]"; | |
import anchorPlugin from "https://esm.sh/[email protected]"; | |
import { slugify } from "https://esm.sh/@mdit-vue/[email protected]"; | |
import { | |
blue as b, | |
cyan as c, | |
gray, | |
red as r, | |
} from "https://deno.land/[email protected]/fmt/colors.ts"; | |
/** Type of the second (options) argument accepted by the global `fetch`. */
type FetchOptions = Parameters<typeof fetch>[1];

/** File name appended to a relative link that does not end in `.md`. */
const INDEX_FILE = "README.md";

/** When false, a relative link ending in `.html` is recorded as an issue. */
const ALLOW_HTML_INSTEAD_OF_MD = false;

/** When false, a fetch that throws is not attempted again. */
const RETRY_FAILED_FETCH = true;

/** Upper bound on the number of fetch attempts made for one URL. */
const MAX_RETRIES = 5;

// Some sites answer scripted requests with a non-OK status even though the
// link is fine. For these URLs the listed status is tolerated.
const ACCEPTABLE_NOT_OK_STATUS: Record<string, number> = {
  "https://dash.cloudflare.com/login": 403,
  "https://dash.cloudflare.com/?account=workers": 403,
};

/** Request options passed along with every external link that is fetched. */
const FETCH_OPTIONS: FetchOptions = {
  method: "GET",
  mode: "cors",
  headers: {
    "User-Agent":
      "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0",
    Accept:
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    Pragma: "no-cache",
    "Cache-Control": "no-cache",
  },
};
// One parser instance is shared by rendered markdown and fetched pages.
const domParser = new DOMParser();

// Markdown renderer. The anchor plugin is handed the imported `slugify`;
// this is what vuepress uses.
const md = MarkdownIt({ linkify: true, html: true });
md.use(anchorPlugin, { slugify });

/**
 * All anchors that are actually present, keyed by the local file path or by
 * the external URL without its fragment.
 */
const allAnchors: Record<string, Set<string>> = {};

/** External links found in each markdown file, keyed by that file's path. */
const links: Record<string, Set<string>> = {};

/**
 * Anchors that were referenced somewhere.
 *   outer key: the linked file or URL (the target of the link)
 *   inner key: the file that mentioned the anchor
 *   value:     the anchors that file mentioned for that target
 */
const usedAnchors: Record<string, Record<string, Set<string>>> = {};
/** An issue that is fully described by the offending link. */
type GeneralIssue = {
  type: "html_instead_of_md" | "file_not_found" | "not_ok" | "parse_error";
  /** The link exactly as it was written in the source file. */
  reference: string;
};

/** An external link whose fetch ended at a different URL. */
type RedirectedIssue = {
  type: "redirected";
  from: string;
  to: string;
};

/** A link whose anchor was not found in the linked file or page. */
type MissingAnchorIssue = {
  type: "missing_anchor";
  /** The linked file path or URL. */
  root: string;
  anchor: string;
};

/** Every kind of issue, discriminated by `type`. */
type Issue = GeneralIssue | RedirectedIssue | MissingAnchorIssue;

/** Issues found so far, keyed by the file that contains the link. */
const issues: Record<string, Issue[]> = {};
/**
 * Recursively walks `directory` and, for every `.md` file found, records the
 * anchors the rendered file contains and classifies each link in it:
 * external links go to `links`, relative links are handed to
 * `resolveRelativeLink`, and same-file anchors go to `usedAnchors`.
 *
 * @param directory Directory to scan; sub-directories are scanned as well.
 * @throws Error when a link is neither external, relative (starting with
 *   `.`), nor a same-file anchor, or when the rendered file cannot be parsed.
 */
async function findLinksFromFiles(directory: string) {
  for await (const dirEntry of Deno.readDir(directory)) {
    const path = join(directory, dirEntry.name);

    if (dirEntry.isDirectory) {
      await findLinksFromFiles(path);
      continue;
    }
    if (!dirEntry.isFile) continue;
    if (extname(dirEntry.name).toLowerCase() !== ".md") continue;

    const content = await Deno.readTextFile(path);

    // Parse once and render from the resulting tokens. `md.render(content)`
    // would parse the same text a second time before rendering it.
    const env = {};
    const tokens = md.parse(content, env);
    const html = md.renderer.render(tokens, md.options, env);

    const document = domParser.parseFromString(html, "text/html");
    if (document == null) {
      throw new Error("Document seems to be empty: shouldn't happen");
    }
    allAnchors[path] = getAnchors(document);

    for (const link of filterLinks(tokens)) {
      if (link.startsWith("http")) { // external link.
        links[path] ??= new Set();
        links[path].add(link);
      } else if (link.startsWith(".")) { // relative path to a file.
        await resolveRelativeLink(directory, path, link);
      } else if (link.startsWith("#")) { // anchor to the same file.
        usedAnchors[path] ??= {};
        usedAnchors[path][path] ??= new Set();
        usedAnchors[path][path].add(link.substring(1));
      } else { // some other type -- MUST be an invalid one
        throw new Error("Different type of link " + link);
      }
    }
  }
}
/**
 * Collects the `href` of every `link_open` token, descending into child
 * tokens. A token's own link comes before the links of its children.
 */
function filterLinks(tokens: ReturnType<typeof md.parse>) {
  return tokens.flatMap((token): string[] => {
    const found: string[] = [];
    if (token.type === "link_open") {
      const href = token.attrGet("href");
      if (href != null) found.push(href);
    }
    if (token.children != null) {
      found.push(...filterLinks(token.children));
    }
    return found;
  });
}
/**
 * Checks a relative link found in `path`. The link target is resolved
 * against `directory`; a missing target or a disallowed `.html` link is
 * recorded in `issues`, and a fragment on an existing target is recorded in
 * `usedAnchors`.
 *
 * @param directory Directory containing the file the link was found in.
 * @param path Path of the file the link was found in.
 * @param link The relative link as written, optionally with a `#fragment`.
 * @throws Rethrows any `Deno.lstat` error other than `Deno.errors.NotFound`.
 */
async function resolveRelativeLink(
  directory: string,
  path: string,
  link: string,
) {
  const [linkedPath, anchor] = link.split("#");
  let root = linkedPath;

  if (root.endsWith(".html")) {
    if (!ALLOW_HTML_INSTEAD_OF_MD) {
      issues[path] ??= [];
      issues[path].push({ type: "html_instead_of_md", reference: link });
      return;
    }
    // Swap only the trailing extension. `replace(".html", ".md")` rewrites
    // the FIRST ".html" in the path, which is wrong when a directory name
    // also contains it.
    root = root.slice(0, -".html".length) + ".md";
  }

  // Anything that is not a markdown file is treated as a directory link.
  if (!root.endsWith(".md")) {
    if (!root.endsWith("/")) root += "/";
    root += INDEX_FILE;
  }

  const relativePath = join(directory, root);
  try {
    await Deno.lstat(relativePath);
    if (anchor == null) return;
    // Remember that `path` referred to this anchor of the linked file.
    usedAnchors[relativePath] ??= {};
    usedAnchors[relativePath][path] ??= new Set();
    usedAnchors[relativePath][path].add(anchor);
  } catch (error) {
    if (error instanceof Deno.errors.NotFound) {
      issues[path] ??= [];
      issues[path].push({ type: "file_not_found", reference: link });
      return;
    }
    throw error;
  }
}
/**
 * Gathers every anchor name a document offers: the targets of `<a href="#…">`
 * links first, followed by the non-empty `id`s of sections, headings and divs.
 */
function getAnchors(document: HTMLDocument): Set<string> {
  const anchors = new Set<string>();

  for (const element of document.getElementsByTagName("a")) {
    const href = element.getAttribute("href");
    // A bare "#" names no anchor, so it is skipped.
    if (href != null && href.startsWith("#") && href.length > 1) {
      anchors.add(href.substring(1));
    }
  }

  const tagsWithIds = ["section", "h1", "h2", "h3", "h4", "h5", "h6", "div"];
  for (const tag of tagsWithIds) {
    for (const id of anchorsFromId(document, tag)) anchors.add(id);
  }

  return anchors;
}
/**
 * Returns the `id` attribute of every `tag` element in the document,
 * skipping elements whose id is missing or blank.
 */
function anchorsFromId(document: HTMLDocument, tag: string) {
  const ids: string[] = [];
  for (const element of document.getElementsByTagName(tag)) {
    const id = element.getAttribute("id");
    if (id != null && id.trim() !== "") ids.push(id);
  }
  return ids;
}
// Start of the local scan: walk the current working directory and collect the
// links and anchors of every markdown file found in it.
await findLinksFromFiles(".");
/**
 * Rewrites a URL, when needed, before it is fetched. Currently only t.me
 * links are changed, to telegram.me; every other URL is returned untouched.
 */
function transformUrl(url: string) {
  const blockedHost = "://t.me/"; // My ISP has blocked t.me :(
  if (!url.includes(blockedHost)) return url;

  warn("Changing t.me to telegram.me for convenience");
  return url.replace(blockedHost, "://telegram.me/");
}
/**
 * Fetches `url`, trying again when the request throws.
 *
 * At most `MAX_RETRIES` attempts are made (always at least one), or a single
 * attempt when `RETRY_FAILED_FETCH` is off. An HTTP error status is NOT a
 * failure here: any response that arrives is returned as-is.
 *
 * @param url The URL to fetch.
 * @param options Options forwarded to `fetch`.
 * @returns The response, or `undefined` when every attempt threw. In that
 *   case the last error is logged.
 */
async function retryFetch(
  url: string,
  options: FetchOptions,
): Promise<Response | undefined> {
  const maxAttempts = RETRY_FAILED_FETCH ? Math.max(1, MAX_RETRIES) : 1;
  let response: Response | undefined;
  let error: unknown;

  for (let attempt = 1; attempt <= maxAttempts && response == null; attempt++) {
    try {
      response = await fetch(url, options);
    } catch (err) {
      error = err;
      // Announce a retry only when another attempt will actually follow.
      if (attempt < maxAttempts) {
        log(`%cINFO%c Retrying (${attempt})`, "orange");
      }
    }
  }

  if (response == null) {
    log(`%cFailed%c Couldn't get a proper response`, "red");
    console.log(error);
  }
  return response;
}
// Manage external links: fetch every distinct external page once, record
// redirect / status / parse issues against the first file that links to it,
// and collect the anchors the page offers.
for (const file in links) {
  for (const url_ of links[file]) {
    const [root, anchor] = url_.split("#");

    // Remember which anchor (if any) this file expects on the page. The page
    // is fetched only the first time it is seen.
    const alreadyFetched = usedAnchors[root] != null;
    usedAnchors[root] ??= {};
    usedAnchors[root][file] ??= new Set();
    if (anchor != null) usedAnchors[root][file].add(anchor);
    if (alreadyFetched) continue; // already fetched once.

    const url = transformUrl(url_);
    log(`%cFetching%c ${root}`, "blue");
    const response = await retryFetch(url, FETCH_OPTIONS);
    if (response == null) {
      // Nothing is known about this page; forget it so that a later link to
      // the same page triggers a fresh fetch.
      delete usedAnchors[root];
      continue;
    }

    if (response.redirected) {
      // `response.url` carries no fragment, so compare it with the requested
      // URL minus its fragment. Otherwise the trailing-slash and
      // appended-query exemptions can never match for links with an anchor.
      const [requested] = url.split("#");
      if (!isValidRedirection(requested, response.url)) {
        issues[file] ??= [];
        issues[file].push({ type: "redirected", from: url_, to: response.url });
      }
    }

    if (!response.ok && ACCEPTABLE_NOT_OK_STATUS[url_] != response.status) {
      issues[file] ??= [];
      issues[file].push({ type: "not_ok", reference: url_ });
      log(
        `%cNOT OK%c response wasn't okay: ${response.status} ${response.statusText}`,
        "red",
      );
    }

    // for parsing the document we need to make sure its html.
    const contentType = response.headers.get("content-type");
    if (!contentType) {
      // `warn` adds the "%cWARN%c" prefix itself; repeating it here would
      // print stray format directives.
      warn("No content-type header, continuing anyway");
    } else if (!contentType.includes("text/html")) {
      warn(`Content-type is: ${contentType}, but let's just go with html`);
    }

    let document: HTMLDocument;
    try {
      const content = await response.text();
      const doc = domParser.parseFromString(content, "text/html");
      if (doc == null) throw new Error("no document, skipping");
      document = doc;
    } catch (err) {
      issues[file] ??= [];
      issues[file].push({ type: "parse_error", reference: url_ });
      log("%cERROR%c Couldn't parse the text (error below), skipping", "red");
      console.log(err);
      continue;
    }
    allAnchors[root] = getAnchors(document);
  }
}
// Missing anchors: compare every anchor that was referenced with the anchors
// the linked file or page actually contains.
for (const root in usedAnchors) {
  const all = allAnchors[root] ?? new Set<string>();
  for (const file in usedAnchors[root]) {
    for (const anchor of usedAnchors[root][file]) {
      // Anchors may be percent-encoded (there are other langs). A malformed
      // sequence such as "100%" makes decodeURIComponent throw, so fall back
      // to the raw text instead of aborting the whole run.
      let decodedAnchor = anchor;
      try {
        decodedAnchor = decodeURIComponent(anchor);
      } catch {
        // keep the raw anchor
      }
      if (
        all.has(decodedAnchor) ||
        isValidAnchor(root, all, decodedAnchor)
      ) continue;
      issues[file] ??= [];
      issues[file].push({
        type: "missing_anchor",
        root,
        anchor: decodedAnchor,
      });
    }
  }
}
/**
 * Decides whether a redirect from `from` to `to` is harmless and can be
 * ignored. The cases are checked in order and the first match wins.
 */
function isValidRedirection(from: string, to: string) {
  // CASE 1: an unversioned deno.land/x third-party module is supposed to be
  // redirected to its latest version, and it was.
  const unversionedModule = from.includes("deno.land/x/") && !from.includes("@");
  if (unversionedModule && to.includes("@")) return true;

  // CASE 2: a deno manual link is supposed to be redirected to the latest
  // version, and it was.
  if (from.includes("deno.com/manual/") && to.includes("@")) return true;

  // CASE 3: short youtu.be links redirecting to youtube.com links.
  const asVideoQuery = from.replace(new URL(from).origin + "/", "?v=");
  if (to.includes(asVideoQuery)) return true;

  // CASE 4: maybe a slash was removed or added --> I don't think we should care.
  if (to + "/" === from || from + "/" === to) return true;

  // CASE 5: maybe some search params were appended --> like a language code?
  if (to.includes(from + "?")) return true;

  // CASE 6: login redirections; e.g., firebase console -> google login.
  const googleLogin = to.includes("accounts.google.com") && to.includes("signin");
  const githubLogin = to.includes("github.com/login?return_to=");
  return googleLogin || githubLogin;
}
/**
 * Some anchors might look missing because of how a website loads its content.
 * Returns true when such an anchor should still be accepted.
 */
function isValidAnchor(root: string, all: Set<string>, anchor: string) {
  // firebase (generally google) docs lazy load their contents, which
  // sometimes messes up the fetched response. As a hack, accept the anchor
  // when the page offers the same name with a "_1" suffix.
  const isFirebaseDocs = root.includes("firebase.google.com/docs");
  return isFirebaseDocs ? all.has(`${anchor}_1`) : false;
}
/** Prints `text` as a warning, prefixed with a yellow "WARN" tag. */
function warn(text: string) {
  const message = "%cWARN%c " + text;
  console.warn(message, "color: yellow", "color: none");
}
/**
 * Prints `text`, which is expected to contain two `%c` directives: the first
 * switches to `color`, the second switches it off again.
 */
function log(text: string, color: string) {
  const styles = ["color: " + color, "color: none"];
  console.log(text, ...styles);
}
// REPORT: print the issues grouped by file (files sorted by path) and count
// them per type.
const issueCounts = {
  total: 0,
  missing_anchor: 0,
  html_instead_of_md: 0,
  file_not_found: 0,
  redirected: 0,
  not_ok: 0,
  parse_error: 0,
};
const sortedFiles = Object.keys(issues).sort((left, right) =>
  left.localeCompare(right)
);

/**
 * Percent-decodes text for display. Malformed input is returned unchanged,
 * because decodeURIComponent would otherwise throw and abort the report.
 */
const d = (text: string): string => {
  try {
    return decodeURIComponent(text);
  } catch {
    return text;
  }
};

for (const file of sortedFiles) {
  const issueList = issues[file];
  issueCounts.total += issueList.length;
  let report = r(`\n${file} (${issueList.length})`);
  for (const issue of issueList) {
    issueCounts[issue.type]++;
    report += `\n ${gray("-->")} `;
    // deno-fmt-ignore
    switch (issue.type) {
      case "missing_anchor":
        // The anchor was decoded when the issue was recorded; decoding it a
        // second time could mangle or reject a literal "%".
        report += `${c(issue.root)} does not have an anchor ${b(issue.anchor)}.`;
        break;
      case "html_instead_of_md": {
        const [root, anchor] = issue.reference.split("#");
        // Links without a fragment used to be shown as "root#undefined".
        const shown = anchor == null ? root : `${root}#${d(anchor)}`;
        report += `The "${b(root)}" in ${b(shown)} should be ending with ".md" instead of ".html".`;
        break;
      }
      case "file_not_found": {
        const [root] = issue.reference.split("#");
        report += `The linked file ${c(root)} does not exist.`;
        break;
      }
      case "redirected":
        report += `${c(issue.from)} was redirected to ${c(issue.to)}.`;
        break;
      case "not_ok":
        report += `${c(issue.reference)} returned a non-ok status code.`;
        break;
      case "parse_error":
        report += `Couldn't parse the document at ${b(issue.reference)}.`;
        break;
    }
  }
  console.log(report);
}
// SUMMARY: one line per issue type, with the counts right-aligned to the
// width of the total.
const maxDistance = issueCounts.total.toString().length;

/** Left-pads a count with spaces to the width of the total. */
function pad(x: number) {
  return x.toString().padStart(maxDistance, " ");
}

const rule = "--------------------------" + "-".repeat(maxDistance);

console.log(`
SUMMARY
${rule}
Missing anchors : ${pad(issueCounts.missing_anchor)}
Used html instead of md : ${pad(issueCounts.html_instead_of_md)}
Links to missing files : ${pad(issueCounts.file_not_found)}
Redirected : ${pad(issueCounts.redirected)}
Not OK response : ${pad(issueCounts.not_ok)}
DOM parsing failed : ${pad(issueCounts.parse_error)}
${rule}
Total : ${issueCounts.total}`);

// A non-zero exit code signals that at least one issue was found.
if (issueCounts.total > 0) {
  Deno.exit(1);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment