Implementation of URL Pattern spec
import assert from "node:assert";

function escapeRegexString(str) {
  return str.replace(/[\.\+\*\?\^\$\{\}\(\)\[\]\|\/\\]/g, "\\$&");
}
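
// Example: escapeRegexString("a.b*c") === "a\\.b\\*c"; every regex
// metacharacter in the input is backslash-escaped.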

// ASCII-only approximation of the spec's "valid name code point" check.
function isValidNameCodePoint(codePoint, isFirstCodePoint) {
  if (isFirstCodePoint) {
    // First code point of a name: A-Z, a-z, or "_".
    return (codePoint >= 65 && codePoint <= 90) ||
      (codePoint >= 97 && codePoint <= 122) ||
      codePoint === 95;
  }
  // Subsequent code points additionally allow the digits 0-9.
  return (codePoint >= 65 && codePoint <= 90) ||
    (codePoint >= 97 && codePoint <= 122) ||
    (codePoint >= 48 && codePoint <= 57) ||
    codePoint === 95;
}

function isASCII(codePoint) {
  return codePoint <= 0x7f;
}

function tokenize(input, policy = "strict") {
  const tokenizer = {
    input,
    policy,
    index: 0,
    nextIndex: 0,
    codePoint: null,
    // NOTE: this counts code points, while the indices below are used as
    // string (code unit) offsets; the two agree for BMP-only input.
    inputCodePointLength: [...input].length,
    tokens: []
  };
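  // Walk the input, dispatching on the current code point to emit tokens.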
  while (tokenizer.index < tokenizer.inputCodePointLength) {
    tokenizer.nextIndex = tokenizer.index;
    tokenizer.codePoint = tokenizer.input.codePointAt(tokenizer.nextIndex);
    tokenizer.nextIndex++;
    switch (tokenizer.codePoint) {
      case 0x2a /* "*" */:
        tokenizer.tokens.push({
          type: "asterisk",
          index: tokenizer.index,
          value: tokenizer.input.substr(tokenizer.index,
            tokenizer.nextIndex - tokenizer.index)
        });
        tokenizer.index++;
        continue;
      case 0x2b /* "+" */:
      case 0x3f /* "?" */:
        tokenizer.tokens.push({
          type: "other-modifier",
          index: tokenizer.index,
          value: tokenizer.input.substr(tokenizer.index,
            tokenizer.nextIndex - tokenizer.index)
        });
        tokenizer.index++;
        continue;
      case 0x5c /* "\" */:
        if (tokenizer.index === tokenizer.inputCodePointLength - 1) {
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid escape sequence.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: tokenizer.index,
            value: tokenizer.input.substr(tokenizer.index,
              tokenizer.nextIndex - tokenizer.index)
          });
          tokenizer.index++;
          continue;
        }
        // The escaped-char token's value is the escaped code point itself,
        // without the leading backslash.
        const escapedIndex = tokenizer.nextIndex;
        tokenizer.codePoint = tokenizer.input.codePointAt(tokenizer.nextIndex);
        tokenizer.nextIndex++;
        tokenizer.tokens.push({
          type: "escaped-char",
          index: escapedIndex,
          value: tokenizer.input.substr(escapedIndex,
            tokenizer.nextIndex - escapedIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
      case 0x7b /* "{" */:
        tokenizer.tokens.push({
          type: "open",
          index: tokenizer.index,
          value: tokenizer.input.substr(tokenizer.index,
            tokenizer.nextIndex - tokenizer.index)
        });
        tokenizer.index++;
        continue;
      case 0x7d /* "}" */:
        tokenizer.tokens.push({
          type: "close",
          index: tokenizer.index,
          value: tokenizer.input.substr(tokenizer.index,
            tokenizer.nextIndex - tokenizer.index)
        });
        tokenizer.index++;
        continue;
      case 0x3a /* ":" */:
        let namePosition = tokenizer.nextIndex;
        const nameStart = namePosition;
        while (namePosition < tokenizer.inputCodePointLength) {
          tokenizer.nextIndex = namePosition;
          tokenizer.codePoint = tokenizer.input.codePointAt(namePosition);
          tokenizer.nextIndex++;
          const isFirstCodePoint = namePosition === nameStart;
          const isValidCodePoint = isValidNameCodePoint(tokenizer.codePoint, isFirstCodePoint);
          if (!isValidCodePoint) {
            break;
          }
          namePosition = tokenizer.nextIndex;
        }
        if (namePosition <= nameStart) {
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid name.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: nameStart,
            value: tokenizer.input.substr(tokenizer.index,
              nameStart - tokenizer.index)
          });
          tokenizer.index = nameStart;
          continue;
        }
        tokenizer.tokens.push({
          type: "name",
          index: nameStart,
          value: tokenizer.input.substr(nameStart,
            namePosition - nameStart)
        });
        tokenizer.index = namePosition;
        continue;
      case 0x28 /* "(" */:
        let depth = 1;
        let regexpPosition = tokenizer.nextIndex;
        let error = false;
        const regexpStart = regexpPosition;
        while (regexpPosition < tokenizer.inputCodePointLength) {
          tokenizer.nextIndex = regexpPosition;
          tokenizer.codePoint = tokenizer.input.codePointAt(regexpPosition);
          tokenizer.nextIndex++;
          if (!isASCII(tokenizer.codePoint)) {
            error = true;
            if (tokenizer.policy === "strict") {
              throw new TypeError("Invalid character.");
            }
            assert(tokenizer.policy === "lenient");
            tokenizer.tokens.push({
              type: "invalid-char",
              index: regexpStart,
              value: tokenizer.input.substr(tokenizer.index,
                regexpStart - tokenizer.index)
            });
            tokenizer.index = regexpStart;
            break;
          }
          if (regexpPosition === regexpStart && tokenizer.codePoint === 0x3f /* "?" */) {
            error = true;
            if (tokenizer.policy === "strict") {
              throw new TypeError("Invalid character.");
            }
            assert(tokenizer.policy === "lenient");
            tokenizer.tokens.push({
              type: "invalid-char",
              index: regexpStart,
              value: tokenizer.input.substr(tokenizer.index,
                regexpStart - tokenizer.index)
            });
            tokenizer.index = regexpStart;
            break;
          }
          if (tokenizer.codePoint === 0x5c /* "\" */) {
            if (regexpPosition === tokenizer.inputCodePointLength - 1) {
              error = true;
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid escape sequence.");
              }
              assert(tokenizer.policy === "lenient");
              tokenizer.tokens.push({
                type: "invalid-char",
                index: tokenizer.index,
                value: tokenizer.input.substr(tokenizer.index,
                  tokenizer.nextIndex - tokenizer.index)
              });
              tokenizer.index++;
              break;
            }
            tokenizer.codePoint = tokenizer.input.codePointAt(tokenizer.nextIndex);
            tokenizer.nextIndex++;
            if (!isASCII(tokenizer.codePoint)) {
              error = true;
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid character.");
              }
              assert(tokenizer.policy === "lenient");
              tokenizer.tokens.push({
                type: "invalid-char",
                index: regexpStart,
                value: tokenizer.input.substr(tokenizer.index,
                  regexpStart - tokenizer.index)
              });
              tokenizer.index = regexpStart;
              break;
            }
            regexpPosition = tokenizer.nextIndex;
            continue;
          }
          if (tokenizer.codePoint === 0x29 /* ")" */) {
            depth--;
            if (depth === 0) {
              regexpPosition = tokenizer.nextIndex;
              break;
            }
          } else if (tokenizer.codePoint === 0x28 /* "(" */) {
            depth++;
            if (regexpPosition === tokenizer.inputCodePointLength - 1) {
              error = true;
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid char.");
              }
              assert(tokenizer.policy === "lenient");
              tokenizer.tokens.push({
                type: "invalid-char",
                index: regexpStart,
                value: tokenizer.input.substr(tokenizer.index,
                  regexpStart - tokenizer.index)
              });
              tokenizer.index = regexpStart;
              break;
            }
            const tempPosition = tokenizer.nextIndex;
            tokenizer.codePoint = tokenizer.input.codePointAt(tokenizer.nextIndex);
            tokenizer.nextIndex++;
            if (tokenizer.codePoint !== 0x3f /* "?" */) {
              error = true;
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid char.");
              }
              assert(tokenizer.policy === "lenient");
              tokenizer.tokens.push({
                type: "invalid-char",
                index: regexpStart,
                value: tokenizer.input.substr(tokenizer.index,
                  regexpStart - tokenizer.index)
              });
              tokenizer.index = regexpStart;
              break;
            }
            tokenizer.nextIndex = tempPosition;
          }
          regexpPosition = tokenizer.nextIndex;
        }
        if (error) {
          continue;
        }
        if (depth !== 0) {
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid char.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: regexpStart,
            value: tokenizer.input.substr(tokenizer.index,
              regexpStart - tokenizer.index)
          });
          tokenizer.index = regexpStart;
          continue;
        }
        const regexpLen = regexpPosition - regexpStart - 1;
        if (regexpLen === 0) {
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid char.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: regexpStart,
            value: tokenizer.input.substr(tokenizer.index,
              regexpStart - tokenizer.index)
          });
          tokenizer.index = regexpStart;
          continue;
        }
        tokenizer.tokens.push({
          type: "regexp",
          index: tokenizer.index,
          value: tokenizer.input.substr(regexpStart, regexpLen)
        });
        tokenizer.index = regexpPosition;
        continue;
      default:
        tokenizer.tokens.push({
          type: "char",
          index: tokenizer.index,
          value: tokenizer.input.substr(tokenizer.index,
            tokenizer.nextIndex - tokenizer.index)
        });
        tokenizer.index++;
        continue;
    }
  }
  tokenizer.tokens.push({
    type: "end",
    index: tokenizer.index,
    value: ""
  });
  return tokenizer.tokens;
}
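
// Example (token list abridged; one "char" token per literal code point):
//   tokenize("/books/:id") yields:
//   [ { type: "char", index: 0, value: "/" },
//     ...char tokens for "books/",
//     { type: "name", index: 8, value: "id" },
//     { type: "end", index: 10, value: "" } ]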

function parse(input,
  options = {
    delimiterCodePoint: "/",
    prefixCodePoint: "/",
  }, encodingCallback = (str) => str) {
  const { delimiterCodePoint, prefixCodePoint } = options;
  const segmentWildcardRegexp = "[^" + escapeRegexString(delimiterCodePoint) + "]+?";
  const parser = {
    tokens: tokenize(input),
    segmentWildcardRegexp,
    parts: [],
    pendingFixedValue: "",
    index: 0,
    nextNumericName: 0,
  };
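
  // Token-consumption helpers: each consumes the current token only when it
  // matches the expected type; otherwise the cursor stays put.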
  function tryToConsume(tokenType) {
    const current = parser.tokens[parser.index];
    if (current.type === tokenType) {
      parser.index++;
      return current;
    }
    return null;
  }
  function tryToConsumeRegexpOrWildcard(nameToken) {
    const token = tryToConsume("regexp");
    return (nameToken === null && token === null)
      ? tryToConsume("asterisk") : token;
  }
  function tryToConsumeModifierToken() {
    const token = tryToConsume("other-modifier");
    return token === null ? tryToConsume("asterisk") : token;
  }
  function maybeAddAPartFromPendingFixedValue() {
    if (parser.pendingFixedValue === "") {
      return;
    }
    const encodedValue = encodingCallback(parser.pendingFixedValue);
    parser.pendingFixedValue = "";
    const part = {
      type: "fixed-text",
      value: encodedValue,
      modifier: "none"
    };
    parser.parts.push(part);
  }
  function addAPart(prefix, nameToken, regexpOrWildcardToken, suffix, modifierToken) {
    let modifier = "none";
    if (modifierToken !== null) {
      switch (modifierToken.value) {
        case "?":
          modifier = "optional";
          break;
        case "*":
          modifier = "zero-or-more";
          break;
        case "+":
          modifier = "one-or-more";
          break;
      }
    }
    if (nameToken === null && regexpOrWildcardToken === null && modifier === "none") {
      parser.pendingFixedValue += prefix;
      return;
    }
    maybeAddAPartFromPendingFixedValue();
    if (nameToken === null && regexpOrWildcardToken === null) {
      assert(suffix === "");
      if (prefix === "") {
        return;
      }
      const encodedValue = encodingCallback(prefix);
      const part = {
        type: "fixed-text",
        value: encodedValue,
        modifier
      };
      parser.parts.push(part);
      return;
    }
    let regexpValue = "";
    if (regexpOrWildcardToken === null) {
      regexpValue = parser.segmentWildcardRegexp;
    } else if (regexpOrWildcardToken.type === "asterisk") {
      regexpValue = ".*";
    } else {
      regexpValue = regexpOrWildcardToken.value;
    }
    let type = "regexp";
    if (regexpValue === parser.segmentWildcardRegexp) {
      type = "segment-wildcard";
      regexpValue = "";
    } else if (regexpValue === ".*") {
      type = "full-wildcard";
      regexpValue = "";
    }
    let name = "";
    if (nameToken !== null) {
      name = nameToken.value;
    } else if (regexpOrWildcardToken !== null) {
      name = parser.nextNumericName.toString();
      parser.nextNumericName++;
    }
    if (parser.parts.find((part) => part.name === name)) {
      throw new TypeError("duplicate name");
    }
    const encodedPrefix = encodingCallback(prefix);
    const encodedSuffix = encodingCallback(suffix);
    const part = {
      type,
      value: regexpValue,
      modifier,
      name,
      prefix: encodedPrefix,
      suffix: encodedSuffix
    };
    parser.parts.push(part);
  }
  function consumeText() {
    let result = "";
    while (parser.index < parser.tokens.length) {
      let token = tryToConsume("char");
      if (token === null) {
        token = tryToConsume("escaped-char");
      }
      if (token === null) {
        break;
      }
      result += token.value;
    }
    return result;
  }
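  // Main loop: fold the token stream into parts. Literal text accumulates in
  // pendingFixedValue; groups and name/regexp/wildcard tokens become parts.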
  while (parser.index < parser.tokens.length) {
    const charToken = tryToConsume("char");
    const nameToken = tryToConsume("name");
    const regexpOrWildcardToken = tryToConsumeRegexpOrWildcard(nameToken);
    if (nameToken !== null || regexpOrWildcardToken !== null) {
      let prefix = charToken ? charToken.value : "";
      if (!["", prefixCodePoint].includes(prefix)) {
        parser.pendingFixedValue += prefix;
        prefix = "";
      }
      maybeAddAPartFromPendingFixedValue();
      const modifierToken = tryToConsumeModifierToken();
      addAPart(prefix, nameToken, regexpOrWildcardToken, "", modifierToken);
      continue;
    }
    let fixedToken = charToken;
    if (fixedToken === null) {
      fixedToken = tryToConsume("escaped-char");
    }
    if (fixedToken !== null) {
      parser.pendingFixedValue += fixedToken.value;
      continue;
    }
    const openToken = tryToConsume("open");
    if (openToken !== null) {
      const prefix = consumeText();
      const nameToken = tryToConsume("name");
      const regexpOrWildcardToken = tryToConsumeRegexpOrWildcard(nameToken);
      const suffix = consumeText();
      const closeToken = tryToConsume("close");
      if (closeToken === null) {
        throw new TypeError("missing close token");
      }
      const modifierToken = tryToConsumeModifierToken();
      addAPart(prefix, nameToken, regexpOrWildcardToken, suffix, modifierToken);
      continue;
    }
    maybeAddAPartFromPendingFixedValue();
    const endToken = tryToConsume("end");
    if (endToken === null) {
      throw new TypeError("expected end token");
    }
  }
  return parser.parts;
}
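
// Example with the default "/" delimiter and prefix options:
//   parse("/books/:id") yields:
//   [ { type: "fixed-text", value: "/books", modifier: "none" },
//     { type: "segment-wildcard", value: "", modifier: "none",
//       name: "id", prefix: "/", suffix: "" } ]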