Last active
February 19, 2025 06:03
-
-
Save andy0130tw/e70d1c063a426edd29ef53213f8b883b to your computer and use it in GitHub Desktop.
Mostly finished
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { blockFunctionNameSet } from './names' | |
interface InputConfig { | |
debug?: boolean | |
locations?: boolean | |
allowServiceRules?: boolean | |
} | |
type IVLanguageVersion = '1.0' | '2.0' | '2.1' | |
export class IVTemplate { | |
constructor( | |
readonly version: IVLanguageVersion, | |
readonly blocks: Block[] | |
) {} | |
} | |
class IVSyntaxError extends Error {} | |
export const ST = Object.freeze({ | |
Option: 'option', | |
Condition: 'condition', | |
Property: 'property', | |
Variable: 'variable', | |
Function: 'function', | |
BlockFunction: 'blockFunction', | |
Include: 'include', | |
// auxiliary types | |
Eof: '_eof', | |
AtRule: '_atRule', // func or blockFunc, based on name | |
ReplaceTagAbbr: '_replaceTagAbbr', | |
BlockFuncStart: '_blockFuncStart', | |
BlockFuncEnd: '_blockFuncEnd', | |
}) | |
type StatementType = typeof ST[keyof typeof ST] | |
type AuxStatementType = StatementType & `_${string}` | |
type ASTNodeType = Exclude<StatementType, AuxStatementType> | |
interface IVDiagnostic { | |
message: string | |
pos: number | |
lineRange: [number, number] | |
} | |
// FIXME: refine for each type | |
export interface IVASTNode<T extends ASTNodeType = ASTNodeType> { | |
type: ASTNodeType & T | |
_source?: string | |
pos?: number | |
lineRange?: [number, number] | |
} | |
interface Block { | |
conditions: IVASTNode<typeof ST.Condition>[] | |
body: IVASTNode[] | |
} | |
export class Input { | |
readonly config: InputConfig | |
type: typeof ST.Eof | ReturnType<typeof inferStatementType> = ST.Eof | |
pos = 0 | |
startPos = 0 | |
startLineNum = 0 | |
endLineNum = 0 | |
content = '' | |
seenFirstStatement: boolean = false | |
languageVersion: IVLanguageVersion | undefined | |
warnings: IVDiagnostic[] = [] | |
constructor( | |
readonly source: string, | |
config: InputConfig = {} | |
) { | |
this.config = { | |
debug: false, | |
locations: true, | |
allowServiceRules: false, | |
...config | |
} | |
this.nextStatement() | |
} | |
get eof() { return this.type == ST.Eof } | |
emitWarning(message: string) { | |
this.warnings.push({ | |
message, | |
pos: this.startPos, | |
lineRange: [this.startLineNum, this.endLineNum], | |
}) | |
} | |
throw(message: string): never { | |
throw new IVSyntaxError(`${formatLineRange(this.startLineNum, this.endLineNum)}: ${message}`) | |
} | |
nextStatement() { | |
const posMax = this.source.length | |
if (this.pos == posMax) { | |
this.type = ST.Eof | |
this.content = '' | |
this.endLineNum = this.startLineNum | |
return | |
} | |
let isLastLine = false | |
let pos = this.pos | |
let content: string | null = null | |
let lineNum = this.endLineNum + 1 | |
let startPos = -1 | |
let startLineNum = -1 | |
while (pos < posMax) { | |
const frags = [] | |
let done = false | |
while (!done) { | |
let lineEnd = this.source.indexOf('\n', pos) | |
if (lineEnd < 0) { | |
isLastLine = true | |
lineEnd = posMax | |
} | |
// find backslash to determine whether to look up the next line | |
const line = this.source.slice(pos, lineEnd).trimStart() | |
const [found, frag] = extractBackslash(line) | |
done = !found | |
if (frag != '') { | |
if (!frags.length) { | |
// a fresh start | |
startLineNum = lineNum | |
startPos = pos | |
} | |
frags.push(frag) | |
} | |
if (isLastLine) { | |
pos = lineEnd | |
done = true | |
} else { | |
// skip past the new line char | |
pos = lineEnd + 1 | |
lineNum++ | |
} | |
} | |
if (frags.length) { | |
const joined = stripTrailingComment(frags.join('')) | |
frags.length = 0 | |
if (joined != '') { | |
content = joined | |
break | |
} | |
} | |
} | |
this.pos = pos | |
this.startPos = startPos | |
this.startLineNum = startLineNum | |
this.endLineNum = isLastLine ? lineNum : lineNum - 1 | |
if (content != null) { | |
this.content = content | |
if (this.type != ST.Eof) { | |
this.seenFirstStatement = true | |
} | |
this.type = inferStatementType(content) | |
} else { | |
// do an extra cycle to set EOF | |
this.nextStatement() | |
} | |
} | |
parse() { | |
const template = parseTemplate(this) | |
return { | |
...template, | |
warnings: this.warnings, | |
} | |
} | |
} | |
function inferStatementType(str: string) { | |
if (str === '}') return ST.BlockFuncEnd | |
switch (str[0]) { | |
case '~': return ST.Option | |
case '?': | |
case '!': | |
return ST.Condition | |
case '@': return ST.AtRule | |
case '<': return ST.ReplaceTagAbbr | |
case '$': return ST.Variable | |
case '+': return ST.Include | |
} | |
return ST.Property | |
} | |
function extractBackslash(str: string): [boolean, string] { | |
// "!": it MUST match something | |
const match = str.match(/(\s*)(\\?)(\s*)$/)! | |
if (match[2]) { | |
return [true, str.slice(0, str.length - match[0].length)] | |
} else { | |
return [false, str.slice(0, str.length - match[1].length)] | |
} | |
} | |
function parseTemplate(input: Input) { | |
const blocks: Block[] = [] | |
let activeBlock: Block = { conditions: [], body: [] } | |
while (!input.eof) { | |
if (input.type === ST.Condition) { | |
if (activeBlock.body.length) { | |
blocks.push(activeBlock) | |
activeBlock = { conditions: [], body: [] } | |
} | |
activeBlock.conditions.push(parseStatement(input) as IVASTNode<typeof ST.Condition>) | |
} else { | |
activeBlock.body.push(parseStatement(input)) | |
} | |
} | |
if (activeBlock.conditions.length || activeBlock.body.length) { | |
blocks.push(activeBlock) | |
} | |
return new IVTemplate(input.languageVersion ?? '1.0', blocks) | |
} | |
function parseStatement(input: Input, parentBlockName?: string): IVASTNode { | |
function doParse() { | |
switch (input.type) { | |
case ST.Option: | |
return parseOption(input) | |
case ST.Condition: | |
if (parentBlockName != null) { | |
const { name } = readRuleParts(input, 1) | |
input.throw(`Conditions are not allowed to appear inside a block func: ${input.content[0]}${name} in @${parentBlockName}`) | |
} | |
return parseCondition(input) | |
case ST.AtRule: | |
return parseAtRule(input) | |
case ST.ReplaceTagAbbr: | |
return parseReplaceTagAbbr(input) | |
case ST.Variable: | |
return parseVariable(input) | |
case ST.Include: | |
if (!input.config.allowServiceRules) { | |
return input.throw('Include rules are not supported in user templates.') | |
} | |
return parseInclude(input) | |
case ST.BlockFuncEnd: | |
if (parentBlockName == null) { | |
input.throw('Unexpected }') | |
} | |
break | |
case ST.Property: return parseProperty(input) | |
} | |
return input.throw(input.eof ? | |
'Expected a statement' : | |
`Unexpected statement type ${input.type}`) | |
} | |
const result = doParse() | |
if (input.languageVersion === undefined) { | |
input.languageVersion = '1.0' | |
} | |
if (!input.seenFirstStatement && input.languageVersion === '1.0') { | |
input.emitWarning('Version 1.0 is outdated. Please specify at least 2.0.') | |
} | |
let meta: Partial<IVASTNode> | undefined | |
if (input.config.debug) { | |
meta = Object.assign(meta ?? {}, { _source: input.content }) | |
} | |
if (input.config.locations) { | |
meta = Object.assign(meta ?? {}, { | |
pos: input.startPos, | |
lineRange: [input.startLineNum, input.endLineNum], | |
}) | |
} | |
// we recurse AFTER the version is resolved | |
// here is the only nested construct | |
if (result.type == ST.BlockFuncStart) { | |
input.nextStatement() | |
let stmts = [] | |
while (input.type != ST.BlockFuncEnd) { | |
stmts.push(parseStatement(input, result.name)) | |
if (input.eof) { | |
input.throw('Expected } but found EOF') | |
} | |
} | |
input.nextStatement() | |
const outer = { | |
type: ST.BlockFunction, | |
name: result.name, | |
query: result.query, | |
body: stmts, | |
} | |
return Object.assign(outer, meta) | |
} | |
input.nextStatement() | |
return Object.assign(result, meta) | |
} | |
function parseOption(input: Input) { | |
const { assertOpeningBrace, name, content } = readRuleParts(input, 1) | |
if (content == null || content == '') | |
input.throw('Option value is required.') | |
if (!name.match(/^[a-zA-Z]\w*$/)) | |
input.throw(`Invalid option name: [${name}]`) | |
assertOpeningBrace(false) | |
if (name === 'version') { | |
const str = readJSONString(content) | |
if (str == null) { | |
input.throw(`Version should be a string: [${content}]`) | |
} | |
let normalized: IVLanguageVersion | |
if (str.match(/^1(\.0?)?$/)) { | |
normalized = '1.0' | |
} else if (str.match(/^2(\.0?)?$/)) { | |
normalized = '2.0' | |
} else if (str === '2.1') { | |
normalized = '2.1' | |
} else { | |
input.throw(`Invalid version: [${str}]`) | |
} | |
// interpret version early because it *might* affect parsing | |
// (but I have not found any in the reference engine) | |
if (input.languageVersion !== undefined) { | |
input.emitWarning('Version can only be specified once at the beginning.') | |
} else { | |
input.languageVersion = normalized | |
} | |
return { type: ST.Option, name: 'version', value: normalized } | |
} else { | |
input.throw(`Unhandled option: [${name}]`) | |
} | |
} | |
function parseCondition(input: Input) { | |
const conditionType = input.content[0] === '?' ? 'OR' : 'AND' | |
const { assertOpeningBrace, name, content } = readRuleParts(input, 1) | |
if (!name.match(/^[a-zA-Z_][-\w\.]*$/)) | |
input.throw(`Invalid condition name: [${name}]`) | |
assertOpeningBrace(false) | |
return { | |
type: ST.Condition, | |
conditionType, | |
name, | |
// depending on condition name, the content is eitherxpath or regexp (but never string) | |
content, | |
} | |
} | |
function parseVariable(input: Input) { | |
const { assertOpeningBrace, name: nameWithSuffix, content } = readRuleParts(input, 1) | |
let name = nameWithSuffix | |
let strategy = { append: false, weak: false } | |
// the order matters! "?+" is allowed but NOT "+?" | |
if (nameWithSuffix.at(-1) == '+') { | |
// name+ means append | |
strategy.append = true | |
name = name.slice(0, -1) | |
} | |
if (nameWithSuffix.at(-1) == '?') { | |
// name? means do not overwrite if non-empty | |
strategy.weak = true | |
name = name.slice(0, -1) | |
} | |
if (!name.match(/^[a-zA-Z]\w*$/)) | |
input.throw(`Invalid variable name: [${name}]`) | |
assertOpeningBrace(false) | |
return { | |
type: ST.Variable, | |
name, | |
strategy: Object.keys(strategy).some(x => x) ? strategy : null, | |
content, | |
} | |
} | |
function parseProperty(input: Input) { | |
const { assertOpeningBrace, name: nameWithSuffix, content } = readRuleParts(input) | |
let name = nameWithSuffix | |
let force: 0 | 1 | 2 = 0 | |
for (let i = 0; i < 2; i++) { | |
if (nameWithSuffix.at(-1) == '!') { | |
force++ | |
name = name.slice(0, -1) | |
} | |
} | |
if (!name.match(/^[a-zA-Z]\w*$/)) | |
input.throw(`Invalid property name: [${name}]`) | |
assertOpeningBrace(false) | |
return { | |
type: ST.Property, | |
name, | |
force, | |
content, | |
} | |
} | |
function parseAtRule(input: Input) { | |
const { assertOpeningBrace, name: nameWithArgs, content } = readRuleParts(input, 1) | |
let name = nameWithArgs | |
let args: IVArgument[] | null = null | |
// XXX: validate function signatures? | |
const lparenPos = findCharToken(nameWithArgs, '(') | |
if (lparenPos >= 0) { | |
if (nameWithArgs.at(-1) == ')') { | |
name = nameWithArgs.slice(0, lparenPos).trimEnd() | |
args = readArguments(nameWithArgs.slice(lparenPos + 1, -1)) | |
} | |
} | |
if (!name.match(/^\w+$/)) | |
input.throw(`Invalid function name: [${name}]`) | |
const isBlockFuncName = blockFunctionNameSet.has(name) | |
assertOpeningBrace(isBlockFuncName) | |
if (isBlockFuncName) { | |
if (args == null || args.length == 0) { | |
input.throw(`@${name} should have 1 argument`) | |
} | |
return { | |
type: ST.BlockFuncStart, | |
name, | |
query: args[0], // excess args are dropped | |
} | |
} | |
return { | |
type: ST.Function, | |
name, | |
content, | |
args, | |
} | |
} | |
function parseReplaceTagAbbr(input: Input) { | |
const tagEndPos = input.content.indexOf('>') | |
if (tagEndPos < 0) { | |
input.throw('> expected but not found') | |
} | |
const tag = input.content.slice(0, tagEndPos + 1) | |
const rest = input.content.slice(tagEndPos + 1) | |
// parse as if the current token were... | |
input.content = `@replace_tag(${tag})${rest}` | |
return parseAtRule(input) | |
} | |
function parseInclude(input: Input) { | |
// naive implementation that should not be used in real cases | |
return { | |
type: ST.Include, | |
content: input.content.slice(1).trimStart(), | |
} | |
} | |
function formatLineRange(a: number, b: number) { | |
return a == b ? `${a}` : `${a}-${b}` | |
} | |
function skipByPattern(str: string, regex: RegExp, pos: number) { | |
if (!regex.sticky) throw new Error('Cannot match without sticky flag set') | |
regex.lastIndex = pos | |
return regex.exec(str) ? regex.lastIndex : -1 | |
} | |
function findStringExtent(str: string, pos: number = 0) { | |
const pat = | |
str[pos] == `"` ? /(?:\\.|[^"\\])*"/y : | |
str[pos] == `'` ? /(?:\\.|[^'\\])*'/y : null | |
if (pat == null) return -1 | |
return skipByPattern(str, pat, pos + 1) | |
} | |
function readJSONString(str: string) { | |
if (str[0] != '"') return null | |
const extent = findStringExtent(str) | |
// assert that the quote char is at the end | |
if (extent != str.length) return null | |
try { | |
// use the built-in JSON parser to do the heavy lifting | |
return JSON.parse(str) as string | |
} catch (err) { | |
if (err instanceof SyntaxError) return null | |
throw err | |
} | |
} | |
function readString(str: string) { | |
// wow this is more convoluted than I have thought | |
const esc = (s: string) => `"${s.slice(1, -1).replace(/"/g, `\\"`).replace(/\\\'/g, `'`)}"` | |
try { | |
const ret = | |
str[0] == `"` ? JSON.parse(str) as string : | |
str[0] == `'` ? JSON.parse(esc(str)) as string : undefined | |
if (ret === undefined) | |
throw new Error('the input does not appear to be a valid string literal.') | |
return ret | |
} catch (err) { | |
if (err instanceof SyntaxError) return null | |
throw err | |
} | |
} | |
export function findCharToken(str: string, chr: string, pos: number = 0) { | |
if (chr.length != 1) throw new Error('Illegal char') | |
// FIXME: is this airtight? | |
if (chr == '\\' || chr == ']') chr = '\\' + chr | |
const pat = new RegExp(`[^'"${chr}]*`, 'y') | |
while (pos < str.length) { | |
let nextPos = skipByPattern(str, pat, pos) | |
if (nextPos == str.length) break | |
// report if the chr (instead of a quote) found | |
if (str[nextPos] == chr) { | |
pos = nextPos | |
break | |
} | |
// skip to where the string literal ends | |
nextPos = findStringExtent(str, nextPos) | |
// if there is no pairing quote, it does not count as a string literal | |
if (nextPos < 0) break | |
pos = nextPos | |
} | |
return str.indexOf(chr, pos) | |
} | |
export function stripTrailingComment(str: string) { | |
const commentStart = findCharToken(str, '#') | |
return commentStart >= 0 ? str.slice(0, commentStart).trimEnd() : str | |
} | |
/** | |
* We rely on the fact that the input and all the output parts are trimmed. | |
*/ | |
export function readRuleParts(input: Input, pos: number = 0): | |
{ assertOpeningBrace: (yes: boolean) => void, name: string, content: string | null } { | |
const str = input.content.slice(pos) | |
const expectTrue = (t: boolean) => t || input.throw('Unexpected {') | |
const expectFalse = (f: boolean) => !f || input.throw('Expected { but found none') | |
// comment: the ref impl is super weird as you can see | |
if (str.at(-1) === '{') { | |
const name = str.slice(0, -1).trimEnd() | |
return { | |
assertOpeningBrace: expectTrue, | |
name, | |
content: null, | |
} | |
} | |
let colonPos = findCharToken(str, ':') | |
if (colonPos < 0) { | |
return { | |
assertOpeningBrace: expectFalse, | |
name: str, | |
content: null, | |
} | |
} | |
return { | |
assertOpeningBrace: expectFalse, | |
name: str.slice(0, colonPos).trimEnd(), | |
content: str.slice(colonPos + 1).trimStart(), | |
} | |
} | |
type IVArgument = string | |
export function readArguments(str: string) { | |
const args: IVArgument[] = [] | |
let pos = 0 | |
let trailingComma = false | |
while (pos < str.length) { | |
const startPos = skipByPattern(str, /\s*/y, pos) | |
let endPos | |
let content: IVArgument | |
let stringExtent = -1 | |
// `"` or `""""` are treated as unquoted strings, WTF | |
if (str[startPos] == `'` || str[startPos] == `"`) { | |
const extent = findStringExtent(str, startPos) | |
if (extent == str.length || (extent >= 0 && str[extent].match(/[\s,]/))) { | |
stringExtent = extent | |
} | |
} | |
if (stringExtent >= 0) { | |
endPos = stringExtent | |
// the official engine treats it as an empty string when the parsing fails, | |
// which leads to some confusing error message like "Regexp pattern is empty" | |
content = readString(str.slice(startPos, endPos)) ?? '' | |
} else { | |
endPos = skipByPattern(str, /[^\s,]*/y, startPos) | |
if (endPos < 0) { | |
endPos = str.length | |
} | |
content = str.slice(startPos, endPos) | |
} | |
args.push(content) | |
// skip to the start of the next argument, consuming a comma if met | |
const nextPos = skipByPattern(str, /\s*/y, endPos) | |
if (nextPos < str.length && str[nextPos] == ',') { | |
trailingComma = true | |
pos = nextPos + 1 | |
} else { | |
trailingComma = false | |
pos = nextPos | |
} | |
} | |
if (trailingComma) { | |
args.push('') | |
} | |
return args | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Input } from './iv-template-line-parser' | |
const rand = (n: number) => Math.floor(Math.random() * n) | |
function referenceParser(str: string) { | |
function trimCont(s: string) { | |
const mat = s.match(/(\s*(?:\\|\\\r))$/) | |
if (mat) { | |
s = s.slice(0, s.length - mat[0].length) | |
} | |
return s | |
} | |
const xs = (str + '\n').split('\n').map(x => x.trim()) | |
const result: string[] = [] | |
for (const x of xs) { | |
if (result.length) { | |
const last = result[result.length - 1] | |
let s = trimCont(last) | |
if (last != s) { | |
result[result.length - 1] = s + x | |
continue | |
} | |
} | |
result.push(x) | |
} | |
return result.filter(x => x) | |
} | |
function fuzz() { | |
const source = Array.from({length: rand(3)}).map(() => | |
' '.repeat(rand(3)) + 'A'.repeat(rand(3)) + ' '.repeat(rand(3)) + | |
['', '\\', '\\\r'][rand(3)] + ' '.repeat(rand(3))).join('\n') + '\n'.repeat(rand(3)) | |
const ref = referenceParser(source) | |
let result: string[] | undefined | |
let err | |
try { | |
result = new Input(source, {debug: true}).parse().blocks.flatMap(blk => [...blk.conditions, ...blk.body]).map(x => x._source!) | |
} catch (e) { | |
err = (e as Error).message | |
} | |
if (result == null) { | |
// when parse fails | |
} else if (ref.length != result.length) { | |
err = 'Array length differs' | |
} else { | |
for (let i = 0; i < ref.length; i++) { | |
if (ref[i] != result[i]) { | |
err = `Incorrect line #${i}` | |
break | |
} | |
} | |
} | |
if (err) { | |
console.log('FAIL --> ' + err) | |
console.log(source.split('\n').map(s => `[${s}]`)) | |
console.log('expected =', ref) | |
console.log('actual =', result) | |
throw 0 | |
} | |
} | |
for (let i = 0; i < 100000; i++) { | |
fuzz() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment