// Source: GitHub gist jordanbtucker/441fee538bd222cd6e9be0c061a3792e
// (created July 21, 2021 20:11), "JSON5 parsing simple references as strings".
// Note: GitHub flags this file as containing bidirectional Unicode text that may be
// interpreted or compiled differently than it appears; review it in an editor that
// reveals hidden Unicode characters.
// JSON5 parser variant in which bare identifiers (including dotted ones such as
// `a.b`) used as values are read as strings.
// See https://github.com/json5/json5/issues/249
const util = require('./util')
// Parser state shared by parse(), lex(), the lexStates/parseStates handlers,
// push() and pop(). parse() reassigns every one of these on entry.
let source // text being parsed, coerced with String()
let parseState // name of the current parseStates handler (also used to pick a lexStates handler)
let stack // containers (objects/arrays) currently being filled, innermost last
let pos // index in `source` of the next unread UTF-16 code unit
let line // current line number, starting at 1
let column // current column, reset to 0 after each '\n'
let token // most recent token returned by lex()
let key // property name waiting for its value
let root // top-level parsed value
module.exports = function parse (text, reviver) { | |
source = String(text) | |
parseState = 'start' | |
stack = [] | |
pos = 0 | |
line = 1 | |
column = 0 | |
token = undefined | |
key = undefined | |
root = undefined | |
do { | |
token = lex() | |
// This code is unreachable. | |
// if (!parseStates[parseState]) { | |
// throw invalidParseState() | |
// } | |
parseStates[parseState]() | |
} while (token.type !== 'eof') | |
if (typeof reviver === 'function') { | |
return internalize({'': root}, '', reviver) | |
} | |
return root | |
} | |
/**
 * Applies `reviver` to `holder[name]` after recursively applying it to every
 * property of that value (children first, then the value itself).
 *
 * Arrays are walked by index and objects by their own enumerable keys, so
 * inherited enumerable properties are never visited or copied onto the value.
 * Replacements are written with Object.defineProperty so a key named
 * `__proto__` is stored as a plain own property instead of going through the
 * prototype setter.
 *
 * @param {Object} holder Object that owns the value being revived.
 * @param {string} name Property name of the value inside `holder`.
 * @param {Function} reviver Called as `reviver.call(holder, name, value)`;
 *   returning `undefined` for a child deletes that child.
 * @returns {*} Whatever `reviver` returns for `holder[name]`.
 */
function internalize (holder, name, reviver) {
    const value = holder[name]

    if (value != null && typeof value === 'object') {
        const reviveChild = (childKey) => {
            const replacement = internalize(value, childKey, reviver)
            if (replacement === undefined) {
                delete value[childKey]
            } else {
                Object.defineProperty(value, childKey, {
                    value: replacement,
                    writable: true,
                    enumerable: true,
                    configurable: true,
                })
            }
        }

        if (Array.isArray(value)) {
            for (let i = 0; i < value.length; i++) {
                reviveChild(String(i))
            }
        } else {
            for (const childKey of Object.keys(value)) {
                reviveChild(childKey)
            }
        }
    }

    return reviver.call(holder, name, value)
}
// Lexer scratch state. lex() resets lexState, buffer, doubleQuote and sign
// before scanning each token and refreshes `c` on every loop iteration.
let lexState // name of the current lexStates handler
let buffer // text accumulated for the token being scanned
let doubleQuote // true while scanning a "..." string, false for '...'
let sign // 1 or -1; multiplied into the numeric literal being scanned
let c // character most recently returned by peek() inside lex()
/**
 * Scans and returns the next token from `source`.
 *
 * Resets the lexer scratch state, then repeatedly peeks the next character
 * into `c` and runs the handler for the current lex state until one of the
 * handlers returns a token.
 *
 * @returns {Object} The token produced by a lexStates handler.
 */
function lex () {
    lexState = 'default'
    buffer = ''
    doubleQuote = false
    sign = 1

    for (;;) {
        c = peek()

        // Every value assigned to lexState in this file names a handler in
        // lexStates, so no existence check is performed here.
        const scanned = lexStates[lexState]()
        if (scanned) {
            return scanned
        }
    }
}
/**
 * Returns the character at the current read position without consuming it.
 *
 * The character is a full code point, so it is two UTF-16 code units long when
 * the position is at a surrogate pair.
 *
 * @returns {string|undefined} The next character, or `undefined` at end of input.
 */
function peek () {
    if (!source[pos]) {
        return undefined
    }

    return String.fromCodePoint(source.codePointAt(pos))
}
/**
 * Consumes and returns the next character, updating `pos`, `line` and `column`.
 *
 * A '\n' starts a new line (column 0). Any other character advances the column
 * by its length in UTF-16 code units. At end of input nothing is consumed, but
 * the column is still advanced by one.
 *
 * @returns {string|undefined} The character consumed, or `undefined` at end of input.
 */
function read () {
    const ch = peek()

    if (ch === undefined) {
        column++
        return ch
    }

    if (ch === '\n') {
        line++
        column = 0
    } else {
        column += ch.length
    }

    pos += ch.length
    return ch
}
/**
 * Lexer state handlers, keyed by state name.
 *
 * lex() calls the handler named by `lexState` with the peeked character in
 * `c`. A handler either consumes input and returns nothing (scanning
 * continues, possibly in a new `lexState`), returns a token built with
 * newToken(), or throws the error built by invalidChar()/invalidIdentifier().
 *
 * The handlers from `start` through `end` at the bottom share their names with
 * the parse states; `default` dispatches to them once whitespace and comments
 * have been skipped.
 */
const lexStates = {
    // Skips whitespace, detects comments and end of input, then hands over to
    // the handler named after the current parse state.
    default () {
        switch (c) {
        case '\t':
        case '\v':
        case '\f':
        case ' ':
        case '\u00A0':
        case '\uFEFF':
        case '\n':
        case '\r':
        case '\u2028':
        case '\u2029':
            read()
            return

        case '/':
            read()
            lexState = 'comment'
            return

        case undefined:
            read()
            return newToken('eof')
        }

        if (util.isSpaceSeparator(c)) {
            read()
            return
        }

        // Every parseState value names a handler in lexStates, so no existence
        // check is performed here.
        return lexStates[parseState]()
    },

    // A '/' has been read; the next character decides the comment style.
    comment () {
        switch (c) {
        case '*':
            read()
            lexState = 'multiLineComment'
            return

        case '/':
            read()
            lexState = 'singleLineComment'
            return
        }

        throw invalidChar(read())
    },

    // Inside /* ... */; end of input before the terminator is an error.
    multiLineComment () {
        switch (c) {
        case '*':
            read()
            lexState = 'multiLineCommentAsterisk'
            return

        case undefined:
            throw invalidChar(read())
        }

        read()
    },

    // Inside /* ... */ right after a '*'; a '/' here closes the comment.
    multiLineCommentAsterisk () {
        switch (c) {
        case '*':
            read()
            return

        case '/':
            read()
            lexState = 'default'
            return

        case undefined:
            throw invalidChar(read())
        }

        read()
        lexState = 'multiLineComment'
    },

    // Inside // ...; a line terminator or end of input closes the comment.
    singleLineComment () {
        switch (c) {
        case '\n':
        case '\r':
        case '\u2028':
        case '\u2029':
            read()
            lexState = 'default'
            return

        case undefined:
            read()
            return newToken('eof')
        }

        read()
    },

    // Start of a value: container opener, number, string or identifier.
    value () {
        switch (c) {
        case '{':
        case '[':
            return newToken('punctuator', read())

        case '-':
        case '+':
            if (read() === '-') {
                sign = -1
            }

            lexState = 'sign'
            return

        case '.':
            buffer = read()
            lexState = 'decimalPointLeading'
            return

        case '0':
            buffer = read()
            lexState = 'zero'
            return

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            buffer = read()
            lexState = 'decimalInteger'
            return

        case '"':
        case "'":
            doubleQuote = (read() === '"')
            buffer = ''
            lexState = 'string'
            return

        case '$':
        case '_':
            buffer = read()
            lexState = 'identifierName'
            return

        case '\\':
            read()
            lexState = 'identifierNameStartEscape'
            return
        }

        if (util.isIdStartChar(c)) {
            buffer += read()
            lexState = 'identifierName'
            return
        }

        throw invalidChar(read())
    },

    // A '\' was read where an identifier may start; only a \uXXXX escape that
    // decodes to '$', '_' or an identifier start character is accepted.
    identifierNameStartEscape () {
        if (c !== 'u') {
            throw invalidChar(read())
        }

        read()
        const u = unicodeEscape()
        switch (u) {
        case '$':
        case '_':
            break

        default:
            if (!util.isIdStartChar(u)) {
                throw invalidIdentifier()
            }

            break
        }

        buffer += u
        lexState = 'identifierName'
    },

    // Continues an identifier; emits the 'identifier' token at the first
    // character that cannot be part of it.
    identifierName () {
        switch (c) {
        case '$':
        case '_':
        case '\u200C':
        case '\u200D':
            buffer += read()
            return

        case '\\':
            read()
            lexState = 'identifierNameEscape'
            return

        case '.':
            // While a property value or array element is expected, a dot
            // extends the identifier so that references such as `a.b` lex as
            // one token. In any other parse state the dot ends the identifier.
            if (parseState === 'beforePropertyValue' || parseState === 'beforeArrayValue') {
                lexState = 'identifierNameDot'
                buffer += read()
                return
            }

            return newToken('identifier', buffer)
        }

        if (util.isIdContinueChar(c)) {
            buffer += read()
            return
        }

        return newToken('identifier', buffer)
    },

    // Right after a '.' inside a dotted identifier: an identifier start
    // character (or an escape for one) must follow.
    identifierNameDot () {
        switch (c) {
        case '$':
        case '_':
            buffer += read()
            lexState = 'identifierName'
            return

        case '\\':
            read()
            lexState = 'identifierNameStartEscape'
            return
        }

        if (util.isIdStartChar(c)) {
            buffer += read()
            lexState = 'identifierName'
            return
        }

        throw invalidChar(read())
    },

    // A '\' was read inside an identifier; only a \uXXXX escape that decodes
    // to an identifier continuation character is accepted.
    identifierNameEscape () {
        if (c !== 'u') {
            throw invalidChar(read())
        }

        read()
        const u = unicodeEscape()
        switch (u) {
        case '$':
        case '_':
        case '\u200C':
        case '\u200D':
            break

        default:
            if (!util.isIdContinueChar(u)) {
                throw invalidIdentifier()
            }

            break
        }

        buffer += u
        lexState = 'identifierName'
    },

    // After a leading '+' or '-': a number, Infinity or NaN must follow.
    sign () {
        switch (c) {
        case '.':
            buffer = read()
            lexState = 'decimalPointLeading'
            return

        case '0':
            buffer = read()
            lexState = 'zero'
            return

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            buffer = read()
            lexState = 'decimalInteger'
            return

        case 'I':
            read()
            literal('nfinity')
            return newToken('numeric', sign * Infinity)

        case 'N':
            read()
            literal('aN')
            return newToken('numeric', NaN)
        }

        throw invalidChar(read())
    },

    // After a leading '0': fraction, exponent, hex prefix, or just zero.
    zero () {
        switch (c) {
        case '.':
            buffer += read()
            lexState = 'decimalPoint'
            return

        case 'e':
        case 'E':
            buffer += read()
            lexState = 'decimalExponent'
            return

        case 'x':
        case 'X':
            buffer += read()
            lexState = 'hexadecimal'
            return
        }

        return newToken('numeric', sign * 0)
    },

    // Inside the integer part of a decimal number.
    decimalInteger () {
        switch (c) {
        case '.':
            buffer += read()
            lexState = 'decimalPoint'
            return

        case 'e':
        case 'E':
            buffer += read()
            lexState = 'decimalExponent'
            return
        }

        if (util.isDigit(c)) {
            buffer += read()
            return
        }

        return newToken('numeric', sign * Number(buffer))
    },

    // After a '.' that starts a number: at least one digit is required.
    decimalPointLeading () {
        if (util.isDigit(c)) {
            buffer += read()
            lexState = 'decimalFraction'
            return
        }

        throw invalidChar(read())
    },

    // After a '.' that follows an integer part: digits are optional.
    decimalPoint () {
        switch (c) {
        case 'e':
        case 'E':
            buffer += read()
            lexState = 'decimalExponent'
            return
        }

        if (util.isDigit(c)) {
            buffer += read()
            lexState = 'decimalFraction'
            return
        }

        return newToken('numeric', sign * Number(buffer))
    },

    // Inside the fractional digits.
    decimalFraction () {
        switch (c) {
        case 'e':
        case 'E':
            buffer += read()
            lexState = 'decimalExponent'
            return
        }

        if (util.isDigit(c)) {
            buffer += read()
            return
        }

        return newToken('numeric', sign * Number(buffer))
    },

    // After 'e'/'E': an optional sign, then at least one digit.
    decimalExponent () {
        switch (c) {
        case '+':
        case '-':
            buffer += read()
            lexState = 'decimalExponentSign'
            return
        }

        if (util.isDigit(c)) {
            buffer += read()
            lexState = 'decimalExponentInteger'
            return
        }

        throw invalidChar(read())
    },

    // After the exponent sign: a digit is required.
    decimalExponentSign () {
        if (util.isDigit(c)) {
            buffer += read()
            lexState = 'decimalExponentInteger'
            return
        }

        throw invalidChar(read())
    },

    // Inside the exponent digits.
    decimalExponentInteger () {
        if (util.isDigit(c)) {
            buffer += read()
            return
        }

        return newToken('numeric', sign * Number(buffer))
    },

    // After '0x'/'0X': at least one hex digit is required.
    hexadecimal () {
        if (util.isHexDigit(c)) {
            buffer += read()
            lexState = 'hexadecimalInteger'
            return
        }

        throw invalidChar(read())
    },

    // Inside the hex digits.
    hexadecimalInteger () {
        if (util.isHexDigit(c)) {
            buffer += read()
            return
        }

        return newToken('numeric', sign * Number(buffer))
    },

    // Inside a quoted string; `doubleQuote` records which quote closes it.
    string () {
        switch (c) {
        case '\\':
            read()
            buffer += escape()
            return

        case '"':
            if (doubleQuote) {
                read()
                return newToken('string', buffer)
            }

            buffer += read()
            return

        case "'":
            if (!doubleQuote) {
                read()
                return newToken('string', buffer)
            }

            buffer += read()
            return

        case '\n':
        case '\r':
            throw invalidChar(read())

        case '\u2028':
        case '\u2029':
            // Accepted, but a warning is emitted before the character is
            // appended below.
            separatorChar(c)
            break

        case undefined:
            throw invalidChar(read())
        }

        buffer += read()
    },

    // Parse state 'start': a container opener or any other value.
    // End of input never reaches this handler; `default` handles it first.
    start () {
        switch (c) {
        case '{':
        case '[':
            return newToken('punctuator', read())
        }

        lexState = 'value'
    },

    // Parse state 'beforePropertyName': a property name or the closing '}'.
    beforePropertyName () {
        switch (c) {
        case '$':
        case '_':
            buffer = read()
            lexState = 'identifierName'
            return

        case '\\':
            read()
            lexState = 'identifierNameStartEscape'
            return

        case '}':
            return newToken('punctuator', read())

        case '"':
        case "'":
            doubleQuote = (read() === '"')
            lexState = 'string'
            return
        }

        if (util.isIdStartChar(c)) {
            buffer += read()
            lexState = 'identifierName'
            return
        }

        throw invalidChar(read())
    },

    // Parse state 'afterPropertyName': only ':' is allowed.
    afterPropertyName () {
        if (c === ':') {
            return newToken('punctuator', read())
        }

        throw invalidChar(read())
    },

    // Parse state 'beforePropertyValue': any value.
    beforePropertyValue () {
        lexState = 'value'
    },

    // Parse state 'afterPropertyValue': ',' or the closing '}'.
    afterPropertyValue () {
        switch (c) {
        case ',':
        case '}':
            return newToken('punctuator', read())
        }

        throw invalidChar(read())
    },

    // Parse state 'beforeArrayValue': the closing ']' or any value.
    beforeArrayValue () {
        if (c === ']') {
            return newToken('punctuator', read())
        }

        lexState = 'value'
    },

    // Parse state 'afterArrayValue': ',' or the closing ']'.
    afterArrayValue () {
        switch (c) {
        case ',':
        case ']':
            return newToken('punctuator', read())
        }

        throw invalidChar(read())
    },

    // Parse state 'end': anything after the top-level value is an error.
    // End of input never reaches this handler; `default` handles it first.
    end () {
        throw invalidChar(read())
    },
}
/**
 * Builds a token stamped with the current `line` and `column`.
 *
 * @param {string} type Token type, e.g. 'punctuator', 'numeric', 'string',
 *   'identifier' or 'eof'.
 * @param {*} [value] Token payload; omitted for 'eof'.
 * @returns {{type: string, value: *, line: number, column: number}}
 */
function newToken (type, value) {
    return {
        type,
        value,
        line,
        column,
    }
}
/**
 * Consumes the exact character sequence `s` from the input.
 *
 * @param {string} s Characters that must appear next, in order.
 * @throws {SyntaxError} The error built by invalidChar() for the first
 *   character (or end of input) that does not match.
 */
function literal (s) {
    for (const expected of s) {
        if (peek() !== expected) {
            throw invalidChar(read())
        }

        read()
    }
}
/**
 * Decodes the escape sequence that follows a '\' inside a string.
 *
 * The backslash has already been consumed by the caller. Handles the single
 * character escapes, `\0` (not followed by a digit), `\xHH`, `\uHHHH` and line
 * continuations. Any other character except the digits 1-9 stands for itself.
 *
 * @returns {string} The decoded text ('' for a line continuation).
 * @throws {SyntaxError} For `\1`-`\9`, `\0` followed by a digit, or end of input.
 */
function escape () {
    const ch = peek()
    switch (ch) {
    case 'b':
        read()
        return '\b'

    case 'f':
        read()
        return '\f'

    case 'n':
        read()
        return '\n'

    case 'r':
        read()
        return '\r'

    case 't':
        read()
        return '\t'

    case 'v':
        read()
        return '\v'

    case '0':
        read()
        if (util.isDigit(peek())) {
            throw invalidChar(read())
        }

        return '\0'

    case 'x':
        read()
        return hexEscape()

    case 'u':
        read()
        return unicodeEscape()

    // Line continuations contribute nothing to the string.
    case '\n':
    case '\u2028':
    case '\u2029':
        read()
        return ''

    case '\r':
        read()
        // Treat CR LF as a single line terminator.
        if (peek() === '\n') {
            read()
        }

        return ''

    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
    case undefined:
        throw invalidChar(read())
    }

    return read()
}
/**
 * Reads the two hex digits of a `\xHH` escape (the `\x` is already consumed).
 *
 * @returns {string} The character with the given code.
 * @throws {SyntaxError} When either of the next two characters is not a hex digit.
 */
function hexEscape () {
    let digits = ''

    for (let remaining = 2; remaining > 0; remaining--) {
        if (!util.isHexDigit(peek())) {
            throw invalidChar(read())
        }

        digits += read()
    }

    return String.fromCodePoint(parseInt(digits, 16))
}
/**
 * Reads the four hex digits of a `\uHHHH` escape (the `\u` is already consumed).
 *
 * @returns {string} The character with the given code.
 * @throws {SyntaxError} When any of the next four characters is not a hex digit.
 */
function unicodeEscape () {
    let digits = ''

    for (let remaining = 4; remaining > 0; remaining--) {
        if (!util.isHexDigit(peek())) {
            throw invalidChar(read())
        }

        digits += read()
    }

    return String.fromCodePoint(parseInt(digits, 16))
}
/**
 * Parser state handlers, keyed by state name.
 *
 * parse() calls the handler named by `parseState` after each token is lexed.
 * Handlers read the current `token`, update `key` and `parseState`, and call
 * push()/pop() to build the result. The lexer only emits tokens that are valid
 * for the current parse state, so apart from end of input the handlers do not
 * re-validate token types.
 */
const parseStates = {
    // Expecting the top-level value.
    start () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        convertIdentifierToValue()
        push()
    },

    // Inside an object, expecting a property name or the closing '}'.
    beforePropertyName () {
        switch (token.type) {
        case 'identifier':
        case 'string':
            key = token.value
            parseState = 'afterPropertyName'
            return

        case 'punctuator':
            // The lexer only emits '}' as a punctuator in this state.
            pop()
            return

        case 'eof':
            throw invalidEOF()
        }
    },

    // Expecting the ':' after a property name (the lexer guarantees it).
    afterPropertyName () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        parseState = 'beforePropertyValue'
    },

    // Expecting the value of the property named by `key`.
    beforePropertyValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        convertIdentifierToValue()
        push()
    },

    // Inside an array, expecting an element or the closing ']'.
    beforeArrayValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        if (token.type === 'punctuator' && token.value === ']') {
            pop()
            return
        }

        convertIdentifierToValue()
        push()
    },

    // After a property value, expecting ',' or the closing '}'.
    afterPropertyValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        switch (token.value) {
        case ',':
            parseState = 'beforePropertyName'
            return

        case '}':
            pop()
        }
    },

    // After an array element, expecting ',' or the closing ']'.
    afterArrayValue () {
        if (token.type === 'eof') {
            throw invalidEOF()
        }

        switch (token.value) {
        case ',':
            parseState = 'beforeArrayValue'
            return

        case ']':
            pop()
        }
    },

    // After the top-level value; the lexer rejects anything but end of input
    // in this state, so there is nothing left to do.
    end () {
    },
}
/**
 * Rewrites the current `token` in place when it is an identifier used in value
 * position.
 *
 * The keywords null, true, false, Infinity and NaN become their literal
 * values. Any other identifier (for example a reference such as `a.b`) keeps
 * its text and is retagged as a 'string' token. Tokens of other types are left
 * untouched.
 */
function convertIdentifierToValue () {
    if (token.type !== 'identifier') {
        return
    }

    switch (token.value) {
    case 'null':
        token.type = 'null'
        token.value = null
        return

    case 'true':
        token.type = 'boolean'
        token.value = true
        return

    case 'false':
        token.type = 'boolean'
        token.value = false
        return

    case 'Infinity':
        token.type = 'numeric'
        token.value = Infinity
        return

    case 'NaN':
        token.type = 'numeric'
        token.value = NaN
        // Return here like the other keywords; without it the token would be
        // retagged as a 'string' below.
        return
    }

    token.type = 'string'
}
/**
 * Adds the value described by the current `token` to the result and moves the
 * parser to the next state.
 *
 * A '{' or '[' punctuator creates a new object or array; null, boolean,
 * numeric and string tokens contribute their value. The first value pushed
 * becomes `root`; later values are appended to the innermost open array or
 * stored under `key` on the innermost open object. New containers are pushed
 * onto `stack`.
 *
 * Object properties are created with Object.defineProperty so a key named
 * `__proto__` becomes an ordinary own property instead of replacing the
 * object's prototype.
 */
function push () {
    let value
    switch (token.type) {
    case 'punctuator':
        switch (token.value) {
        case '{':
            value = {}
            break

        case '[':
            value = []
            break
        }

        break

    case 'null':
    case 'boolean':
    case 'numeric':
    case 'string':
        value = token.value
        break
    }

    if (root === undefined) {
        root = value
    } else {
        const parent = stack[stack.length - 1]
        if (Array.isArray(parent)) {
            parent.push(value)
        } else {
            Object.defineProperty(parent, key, {
                value,
                writable: true,
                enumerable: true,
                configurable: true,
            })
        }
    }

    if (value !== null && typeof value === 'object') {
        // A container was opened: descend into it.
        stack.push(value)
        parseState = Array.isArray(value) ? 'beforeArrayValue' : 'beforePropertyName'
        return
    }

    // A primitive was added: continue in whatever container is still open.
    const current = stack[stack.length - 1]
    if (current == null) {
        parseState = 'end'
    } else if (Array.isArray(current)) {
        parseState = 'afterArrayValue'
    } else {
        parseState = 'afterPropertyValue'
    }
}
/**
 * Closes the innermost open container and sets the parse state according to
 * the container that is now innermost: 'end' when none is left,
 * 'afterArrayValue' for an array, otherwise 'afterPropertyValue'.
 */
function pop () {
    stack.pop()

    const enclosing = stack[stack.length - 1]
    if (enclosing == null) {
        parseState = 'end'
        return
    }

    parseState = Array.isArray(enclosing) ? 'afterArrayValue' : 'afterPropertyValue'
}
// This code is unreachable.
// function invalidParseState () {
//     return new Error(`JSON5: invalid parse state '${parseState}'`)
// }
// This code is unreachable.
// function invalidLexState (state) {
//     return new Error(`JSON5: invalid lex state '${state}'`)
// }
/**
 * Builds the error for an unexpected character at the current position.
 *
 * @param {string|undefined} c The offending character, or `undefined` when the
 *   input ended unexpectedly.
 * @returns {SyntaxError} The error from syntaxError(); the caller throws it.
 */
function invalidChar (c) {
    if (c === undefined) {
        return syntaxError(`JSON5: invalid end of input at ${line}:${column}`)
    }

    return syntaxError(`JSON5: invalid character '${formatChar(c)}' at ${line}:${column}`)
}
/**
 * Builds the error for input that ends before the value is complete.
 *
 * @returns {SyntaxError} The error from syntaxError(); the caller throws it.
 */
function invalidEOF () {
    return syntaxError(`JSON5: invalid end of input at ${line}:${column}`)
}
// This code is unreachable.
// function invalidToken () {
//     if (token.type === 'eof') {
//         return syntaxError(`JSON5: invalid end of input at ${line}:${column}`)
//     }
//     const c = String.fromCodePoint(token.value.codePointAt(0))
//     return syntaxError(`JSON5: invalid character '${formatChar(c)}' at ${line}:${column}`)
// }
/**
 * Builds the error for a unicode escape that decodes to a character not
 * allowed in an identifier.
 *
 * Side effect: moves `column` back by 5 before reporting, so the reported
 * position is 5 columns before the current read position.
 *
 * @returns {SyntaxError} The error from syntaxError(); the caller throws it.
 */
function invalidIdentifier () {
    column -= 5
    return syntaxError(`JSON5: invalid identifier character at ${line}:${column}`)
}
/**
 * Warns on the console that a character appearing unescaped in a string is not
 * valid ECMAScript. Parsing is not interrupted.
 *
 * @param {string} c The character to mention in the warning.
 */
function separatorChar (c) {
    console.warn(`JSON5: '${formatChar(c)}' in strings is not valid ECMAScript; consider escaping`)
}
/**
 * Returns a printable form of a character for use in error messages.
 *
 * Quotes, the backslash, common control characters and the line/paragraph
 * separators are shown as escape sequences. Any other character below U+0020
 * is shown as a two-digit `\xHH` escape. Everything else is returned unchanged.
 *
 * @param {string} c The character to format.
 * @returns {string} The printable representation.
 */
function formatChar (c) {
    const replacements = {
        "'": "\\'",
        '"': '\\"',
        '\\': '\\\\',
        '\b': '\\b',
        '\f': '\\f',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
        '\v': '\\v',
        '\0': '\\0',
        '\u2028': '\\u2028',
        '\u2029': '\\u2029',
    }

    if (replacements[c]) {
        return replacements[c]
    }

    if (c < ' ') {
        const hexString = c.charCodeAt(0).toString(16)
        // Left-pad to two hex digits.
        return '\\x' + ('00' + hexString).substring(hexString.length)
    }

    return c
}
/**
 * Creates a SyntaxError annotated with the current parse position.
 *
 * @param {string} message Error message.
 * @returns {SyntaxError} Error carrying `lineNumber` and `columnNumber`
 *   properties copied from `line` and `column`.
 */
function syntaxError (message) {
    const err = new SyntaxError(message)
    err.lineNumber = line
    err.columnNumber = column
    return err
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment