Skip to content

Instantly share code, notes, and snippets.

@andy0130tw
Last active February 19, 2025 06:03
Show Gist options
  • Save andy0130tw/e70d1c063a426edd29ef53213f8b883b to your computer and use it in GitHub Desktop.
Save andy0130tw/e70d1c063a426edd29ef53213f8b883b to your computer and use it in GitHub Desktop.
Mostly finished
import { blockFunctionNameSet } from './names'
interface InputConfig {
debug?: boolean
locations?: boolean
allowServiceRules?: boolean
}
type IVLanguageVersion = '1.0' | '2.0' | '2.1'
export class IVTemplate {
constructor(
readonly version: IVLanguageVersion,
readonly blocks: Block[]
) {}
}
class IVSyntaxError extends Error {}
export const ST = Object.freeze({
Option: 'option',
Condition: 'condition',
Property: 'property',
Variable: 'variable',
Function: 'function',
BlockFunction: 'blockFunction',
Include: 'include',
// auxiliary types
Eof: '_eof',
AtRule: '_atRule', // func or blockFunc, based on name
ReplaceTagAbbr: '_replaceTagAbbr',
BlockFuncStart: '_blockFuncStart',
BlockFuncEnd: '_blockFuncEnd',
})
type StatementType = typeof ST[keyof typeof ST]
type AuxStatementType = StatementType & `_${string}`
type ASTNodeType = Exclude<StatementType, AuxStatementType>
interface IVDiagnostic {
message: string
pos: number
lineRange: [number, number]
}
// FIXME: refine for each type
export interface IVASTNode<T extends ASTNodeType = ASTNodeType> {
type: ASTNodeType & T
_source?: string
pos?: number
lineRange?: [number, number]
}
interface Block {
conditions: IVASTNode<typeof ST.Condition>[]
body: IVASTNode[]
}
export class Input {
readonly config: InputConfig
type: typeof ST.Eof | ReturnType<typeof inferStatementType> = ST.Eof
pos = 0
startPos = 0
startLineNum = 0
endLineNum = 0
content = ''
seenFirstStatement: boolean = false
languageVersion: IVLanguageVersion | undefined
warnings: IVDiagnostic[] = []
constructor(
readonly source: string,
config: InputConfig = {}
) {
this.config = {
debug: false,
locations: true,
allowServiceRules: false,
...config
}
this.nextStatement()
}
get eof() { return this.type == ST.Eof }
emitWarning(message: string) {
this.warnings.push({
message,
pos: this.startPos,
lineRange: [this.startLineNum, this.endLineNum],
})
}
throw(message: string): never {
throw new IVSyntaxError(`${formatLineRange(this.startLineNum, this.endLineNum)}: ${message}`)
}
nextStatement() {
const posMax = this.source.length
if (this.pos == posMax) {
this.type = ST.Eof
this.content = ''
this.endLineNum = this.startLineNum
return
}
let isLastLine = false
let pos = this.pos
let content: string | null = null
let lineNum = this.endLineNum + 1
let startPos = -1
let startLineNum = -1
while (pos < posMax) {
const frags = []
let done = false
while (!done) {
let lineEnd = this.source.indexOf('\n', pos)
if (lineEnd < 0) {
isLastLine = true
lineEnd = posMax
}
// find backslash to determine whether to look up the next line
const line = this.source.slice(pos, lineEnd).trimStart()
const [found, frag] = extractBackslash(line)
done = !found
if (frag != '') {
if (!frags.length) {
// a fresh start
startLineNum = lineNum
startPos = pos
}
frags.push(frag)
}
if (isLastLine) {
pos = lineEnd
done = true
} else {
// skip past the new line char
pos = lineEnd + 1
lineNum++
}
}
if (frags.length) {
const joined = stripTrailingComment(frags.join(''))
frags.length = 0
if (joined != '') {
content = joined
break
}
}
}
this.pos = pos
this.startPos = startPos
this.startLineNum = startLineNum
this.endLineNum = isLastLine ? lineNum : lineNum - 1
if (content != null) {
this.content = content
if (this.type != ST.Eof) {
this.seenFirstStatement = true
}
this.type = inferStatementType(content)
} else {
// do an extra cycle to set EOF
this.nextStatement()
}
}
parse() {
const template = parseTemplate(this)
return {
...template,
warnings: this.warnings,
}
}
}
function inferStatementType(str: string) {
if (str === '}') return ST.BlockFuncEnd
switch (str[0]) {
case '~': return ST.Option
case '?':
case '!':
return ST.Condition
case '@': return ST.AtRule
case '<': return ST.ReplaceTagAbbr
case '$': return ST.Variable
case '+': return ST.Include
}
return ST.Property
}
function extractBackslash(str: string): [boolean, string] {
// "!": it MUST match something
const match = str.match(/(\s*)(\\?)(\s*)$/)!
if (match[2]) {
return [true, str.slice(0, str.length - match[0].length)]
} else {
return [false, str.slice(0, str.length - match[1].length)]
}
}
function parseTemplate(input: Input) {
const blocks: Block[] = []
let activeBlock: Block = { conditions: [], body: [] }
while (!input.eof) {
if (input.type === ST.Condition) {
if (activeBlock.body.length) {
blocks.push(activeBlock)
activeBlock = { conditions: [], body: [] }
}
activeBlock.conditions.push(parseStatement(input) as IVASTNode<typeof ST.Condition>)
} else {
activeBlock.body.push(parseStatement(input))
}
}
if (activeBlock.conditions.length || activeBlock.body.length) {
blocks.push(activeBlock)
}
return new IVTemplate(input.languageVersion ?? '1.0', blocks)
}
function parseStatement(input: Input, parentBlockName?: string): IVASTNode {
function doParse() {
switch (input.type) {
case ST.Option:
return parseOption(input)
case ST.Condition:
if (parentBlockName != null) {
const { name } = readRuleParts(input, 1)
input.throw(`Conditions are not allowed to appear inside a block func: ${input.content[0]}${name} in @${parentBlockName}`)
}
return parseCondition(input)
case ST.AtRule:
return parseAtRule(input)
case ST.ReplaceTagAbbr:
return parseReplaceTagAbbr(input)
case ST.Variable:
return parseVariable(input)
case ST.Include:
if (!input.config.allowServiceRules) {
return input.throw('Include rules are not supported in user templates.')
}
return parseInclude(input)
case ST.BlockFuncEnd:
if (parentBlockName == null) {
input.throw('Unexpected }')
}
break
case ST.Property: return parseProperty(input)
}
return input.throw(input.eof ?
'Expected a statement' :
`Unexpected statement type ${input.type}`)
}
const result = doParse()
if (input.languageVersion === undefined) {
input.languageVersion = '1.0'
}
if (!input.seenFirstStatement && input.languageVersion === '1.0') {
input.emitWarning('Version 1.0 is outdated. Please specify at least 2.0.')
}
let meta: Partial<IVASTNode> | undefined
if (input.config.debug) {
meta = Object.assign(meta ?? {}, { _source: input.content })
}
if (input.config.locations) {
meta = Object.assign(meta ?? {}, {
pos: input.startPos,
lineRange: [input.startLineNum, input.endLineNum],
})
}
// we recurse AFTER the version is resolved
// here is the only nested construct
if (result.type == ST.BlockFuncStart) {
input.nextStatement()
let stmts = []
while (input.type != ST.BlockFuncEnd) {
stmts.push(parseStatement(input, result.name))
if (input.eof) {
input.throw('Expected } but found EOF')
}
}
input.nextStatement()
const outer = {
type: ST.BlockFunction,
name: result.name,
query: result.query,
body: stmts,
}
return Object.assign(outer, meta)
}
input.nextStatement()
return Object.assign(result, meta)
}
function parseOption(input: Input) {
const { assertOpeningBrace, name, content } = readRuleParts(input, 1)
if (content == null || content == '')
input.throw('Option value is required.')
if (!name.match(/^[a-zA-Z]\w*$/))
input.throw(`Invalid option name: [${name}]`)
assertOpeningBrace(false)
if (name === 'version') {
const str = readJSONString(content)
if (str == null) {
input.throw(`Version should be a string: [${content}]`)
}
let normalized: IVLanguageVersion
if (str.match(/^1(\.0?)?$/)) {
normalized = '1.0'
} else if (str.match(/^2(\.0?)?$/)) {
normalized = '2.0'
} else if (str === '2.1') {
normalized = '2.1'
} else {
input.throw(`Invalid version: [${str}]`)
}
// interpret version early because it *might* affect parsing
// (but I have not found any in the reference engine)
if (input.languageVersion !== undefined) {
input.emitWarning('Version can only be specified once at the beginning.')
} else {
input.languageVersion = normalized
}
return { type: ST.Option, name: 'version', value: normalized }
} else {
input.throw(`Unhandled option: [${name}]`)
}
}
function parseCondition(input: Input) {
const conditionType = input.content[0] === '?' ? 'OR' : 'AND'
const { assertOpeningBrace, name, content } = readRuleParts(input, 1)
if (!name.match(/^[a-zA-Z_][-\w\.]*$/))
input.throw(`Invalid condition name: [${name}]`)
assertOpeningBrace(false)
return {
type: ST.Condition,
conditionType,
name,
// depending on condition name, the content is eitherxpath or regexp (but never string)
content,
}
}
function parseVariable(input: Input) {
const { assertOpeningBrace, name: nameWithSuffix, content } = readRuleParts(input, 1)
let name = nameWithSuffix
let strategy = { append: false, weak: false }
// the order matters! "?+" is allowed but NOT "+?"
if (nameWithSuffix.at(-1) == '+') {
// name+ means append
strategy.append = true
name = name.slice(0, -1)
}
if (nameWithSuffix.at(-1) == '?') {
// name? means do not overwrite if non-empty
strategy.weak = true
name = name.slice(0, -1)
}
if (!name.match(/^[a-zA-Z]\w*$/))
input.throw(`Invalid variable name: [${name}]`)
assertOpeningBrace(false)
return {
type: ST.Variable,
name,
strategy: Object.keys(strategy).some(x => x) ? strategy : null,
content,
}
}
function parseProperty(input: Input) {
const { assertOpeningBrace, name: nameWithSuffix, content } = readRuleParts(input)
let name = nameWithSuffix
let force: 0 | 1 | 2 = 0
for (let i = 0; i < 2; i++) {
if (nameWithSuffix.at(-1) == '!') {
force++
name = name.slice(0, -1)
}
}
if (!name.match(/^[a-zA-Z]\w*$/))
input.throw(`Invalid property name: [${name}]`)
assertOpeningBrace(false)
return {
type: ST.Property,
name,
force,
content,
}
}
function parseAtRule(input: Input) {
const { assertOpeningBrace, name: nameWithArgs, content } = readRuleParts(input, 1)
let name = nameWithArgs
let args: IVArgument[] | null = null
// XXX: validate function signatures?
const lparenPos = findCharToken(nameWithArgs, '(')
if (lparenPos >= 0) {
if (nameWithArgs.at(-1) == ')') {
name = nameWithArgs.slice(0, lparenPos).trimEnd()
args = readArguments(nameWithArgs.slice(lparenPos + 1, -1))
}
}
if (!name.match(/^\w+$/))
input.throw(`Invalid function name: [${name}]`)
const isBlockFuncName = blockFunctionNameSet.has(name)
assertOpeningBrace(isBlockFuncName)
if (isBlockFuncName) {
if (args == null || args.length == 0) {
input.throw(`@${name} should have 1 argument`)
}
return {
type: ST.BlockFuncStart,
name,
query: args[0], // excess args are dropped
}
}
return {
type: ST.Function,
name,
content,
args,
}
}
function parseReplaceTagAbbr(input: Input) {
const tagEndPos = input.content.indexOf('>')
if (tagEndPos < 0) {
input.throw('> expected but not found')
}
const tag = input.content.slice(0, tagEndPos + 1)
const rest = input.content.slice(tagEndPos + 1)
// parse as if the current token were...
input.content = `@replace_tag(${tag})${rest}`
return parseAtRule(input)
}
function parseInclude(input: Input) {
// naive implementation that should not be used in real cases
return {
type: ST.Include,
content: input.content.slice(1).trimStart(),
}
}
function formatLineRange(a: number, b: number) {
return a == b ? `${a}` : `${a}-${b}`
}
function skipByPattern(str: string, regex: RegExp, pos: number) {
if (!regex.sticky) throw new Error('Cannot match without sticky flag set')
regex.lastIndex = pos
return regex.exec(str) ? regex.lastIndex : -1
}
function findStringExtent(str: string, pos: number = 0) {
const pat =
str[pos] == `"` ? /(?:\\.|[^"\\])*"/y :
str[pos] == `'` ? /(?:\\.|[^'\\])*'/y : null
if (pat == null) return -1
return skipByPattern(str, pat, pos + 1)
}
function readJSONString(str: string) {
if (str[0] != '"') return null
const extent = findStringExtent(str)
// assert that the quote char is at the end
if (extent != str.length) return null
try {
// use the built-in JSON parser to do the heavy lifting
return JSON.parse(str) as string
} catch (err) {
if (err instanceof SyntaxError) return null
throw err
}
}
function readString(str: string) {
// wow this is more convoluted than I have thought
const esc = (s: string) => `"${s.slice(1, -1).replace(/"/g, `\\"`).replace(/\\\'/g, `'`)}"`
try {
const ret =
str[0] == `"` ? JSON.parse(str) as string :
str[0] == `'` ? JSON.parse(esc(str)) as string : undefined
if (ret === undefined)
throw new Error('the input does not appear to be a valid string literal.')
return ret
} catch (err) {
if (err instanceof SyntaxError) return null
throw err
}
}
export function findCharToken(str: string, chr: string, pos: number = 0) {
if (chr.length != 1) throw new Error('Illegal char')
// FIXME: is this airtight?
if (chr == '\\' || chr == ']') chr = '\\' + chr
const pat = new RegExp(`[^'"${chr}]*`, 'y')
while (pos < str.length) {
let nextPos = skipByPattern(str, pat, pos)
if (nextPos == str.length) break
// report if the chr (instead of a quote) found
if (str[nextPos] == chr) {
pos = nextPos
break
}
// skip to where the string literal ends
nextPos = findStringExtent(str, nextPos)
// if there is no pairing quote, it does not count as a string literal
if (nextPos < 0) break
pos = nextPos
}
return str.indexOf(chr, pos)
}
export function stripTrailingComment(str: string) {
const commentStart = findCharToken(str, '#')
return commentStart >= 0 ? str.slice(0, commentStart).trimEnd() : str
}
/**
* We rely on the fact that the input and all the output parts are trimmed.
*/
export function readRuleParts(input: Input, pos: number = 0):
{ assertOpeningBrace: (yes: boolean) => void, name: string, content: string | null } {
const str = input.content.slice(pos)
const expectTrue = (t: boolean) => t || input.throw('Unexpected {')
const expectFalse = (f: boolean) => !f || input.throw('Expected { but found none')
// comment: the ref impl is super weird as you can see
if (str.at(-1) === '{') {
const name = str.slice(0, -1).trimEnd()
return {
assertOpeningBrace: expectTrue,
name,
content: null,
}
}
let colonPos = findCharToken(str, ':')
if (colonPos < 0) {
return {
assertOpeningBrace: expectFalse,
name: str,
content: null,
}
}
return {
assertOpeningBrace: expectFalse,
name: str.slice(0, colonPos).trimEnd(),
content: str.slice(colonPos + 1).trimStart(),
}
}
type IVArgument = string
export function readArguments(str: string) {
const args: IVArgument[] = []
let pos = 0
let trailingComma = false
while (pos < str.length) {
const startPos = skipByPattern(str, /\s*/y, pos)
let endPos
let content: IVArgument
let stringExtent = -1
// `"` or `""""` are treated as unquoted strings, WTF
if (str[startPos] == `'` || str[startPos] == `"`) {
const extent = findStringExtent(str, startPos)
if (extent == str.length || (extent >= 0 && str[extent].match(/[\s,]/))) {
stringExtent = extent
}
}
if (stringExtent >= 0) {
endPos = stringExtent
// the official engine treats it as an empty string when the parsing fails,
// which leads to some confusing error message like "Regexp pattern is empty"
content = readString(str.slice(startPos, endPos)) ?? ''
} else {
endPos = skipByPattern(str, /[^\s,]*/y, startPos)
if (endPos < 0) {
endPos = str.length
}
content = str.slice(startPos, endPos)
}
args.push(content)
// skip to the start of the next argument, consuming a comma if met
const nextPos = skipByPattern(str, /\s*/y, endPos)
if (nextPos < str.length && str[nextPos] == ',') {
trailingComma = true
pos = nextPos + 1
} else {
trailingComma = false
pos = nextPos
}
}
if (trailingComma) {
args.push('')
}
return args
}
import { Input } from './iv-template-line-parser'
const rand = (n: number) => Math.floor(Math.random() * n)
function referenceParser(str: string) {
function trimCont(s: string) {
const mat = s.match(/(\s*(?:\\|\\\r))$/)
if (mat) {
s = s.slice(0, s.length - mat[0].length)
}
return s
}
const xs = (str + '\n').split('\n').map(x => x.trim())
const result: string[] = []
for (const x of xs) {
if (result.length) {
const last = result[result.length - 1]
let s = trimCont(last)
if (last != s) {
result[result.length - 1] = s + x
continue
}
}
result.push(x)
}
return result.filter(x => x)
}
function fuzz() {
const source = Array.from({length: rand(3)}).map(() =>
' '.repeat(rand(3)) + 'A'.repeat(rand(3)) + ' '.repeat(rand(3)) +
['', '\\', '\\\r'][rand(3)] + ' '.repeat(rand(3))).join('\n') + '\n'.repeat(rand(3))
const ref = referenceParser(source)
let result: string[] | undefined
let err
try {
result = new Input(source, {debug: true}).parse().blocks.flatMap(blk => [...blk.conditions, ...blk.body]).map(x => x._source!)
} catch (e) {
err = (e as Error).message
}
if (result == null) {
// when parse fails
} else if (ref.length != result.length) {
err = 'Array length differs'
} else {
for (let i = 0; i < ref.length; i++) {
if (ref[i] != result[i]) {
err = `Incorrect line #${i}`
break
}
}
}
if (err) {
console.log('FAIL --> ' + err)
console.log(source.split('\n').map(s => `[${s}]`))
console.log('expected =', ref)
console.log('actual =', result)
throw 0
}
}
for (let i = 0; i < 100000; i++) {
fuzz()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment