Lexer for the Planetary Data System (PDS) Object Description Language (ODL)
/*
This program implements a lexer for the Object Description Language (ODL), a legacy metadata format
used by NASA's Planetary Data System (PDS).

As of this writing, the ODL version is 2.1, and the specification can be found at:

https://pds.jpl.nasa.gov/documents/sr/Chapter12.pdf

This lexer simply emits the tokens needed by an ODL parser, which is provided as a separate program.

(C) Nabla Zero Labs, 2018
*/
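// For context, an ODL label is a sequence of NAME = VALUE statements, optionally grouped with
// OBJECT ... END_OBJECT blocks and terminated by END. The fragment below is a hypothetical
// illustration (it is not taken from the specification) of the kind of input this lexer consumes:
//
//     OBJECT     = IMAGE
//       LINES    = 1024
//     END_OBJECT = IMAGE
//     END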
"use strict"; | |
const TOKEN_NAMES = [ | |
"NEWLINE", "TAB", "SPACE", | |
"COLON", "COMMA", "PERIOD", | |
"LPAR", "RPAR", "LCURLY", "RCURLY", "LSQUARE", "RSQUARE", | |
"LT", "GT", "EQUAL", | |
"PLUS", "DASH", "ASTERISK", "SLASH", | |
"CIRCUMFLEX", "AT", "HASH", "AMPERSAND", "DOLLAR", | |
"SQUOTE", "DQUOTE", | |
"INTEGER", "IDENTIFIER", "COMMENT", | |
"STRING", "SYMBOL" | |
]; | |
const make_tokens = function () { | |
let tokens = {}; | |
TOKEN_NAMES.forEach(function (name, index) { | |
tokens[name] = index; | |
}); | |
return Object.freeze(tokens); | |
}; | |
const Token = make_tokens(); | |
const ATOMS = { | |
"\n": Token.NEWLINE, | |
"\t": Token.TAB, | |
" ": Token.SPACE, | |
":": Token.COLON, | |
",": Token.COMMA, | |
".": Token.PERIOD, | |
"(": Token.LPAR, | |
")": Token.RPAR, | |
"{": Token.LCURLY, | |
"}": Token.RCURLY, | |
"[": Token.LSQUARE, | |
"]": Token.RSQUARE, | |
"<": Token.LT, | |
">": Token.GT, | |
"=": Token.EQUAL, | |
"+": Token.PLUS, | |
"-": Token.DASH, | |
"*": Token.ASTERISK, | |
"/": Token.SLASH, | |
"^": Token.CIRCUMFLEX, | |
"@": Token.AT, | |
"#": Token.HASH, | |
"&": Token.AMPERSAND, | |
"$": Token.DOLLAR, | |
"'": Token.SQUOTE, | |
"\"": Token.DQUOTE | |
}; | |
const is_alpha = (c) => (/^[A-Za-z]$/).test(c); | |
const is_digit = (c) => (/^[0-9]$/).test(c); | |
const is_identifier_char = (c) => (/^[A-Za-z0-9_]$/).test(c); | |
const is_space = (c) => ((c === " ") || (c === "\t") || (c === "\n") || (c === "\r")); | |
const lex = function (s) {
    let at = 0;
    const len = s.length;

    const peek = () => s[at];

    const get = function () {
        const c = s[at];
        at += 1;
        return c;
    };

    const error = function (m) {
        throw {"what": "SyntaxError", "at": at, "message": m};
    };

    const comment = function () {
        let token = {token: Token.COMMENT, lexeme: "", done: false};
        get(); // eat leading `*`
        while (true) {
            if (peek() === undefined) {
                error("unterminated comment");
            }
            if (peek() === "*") {
                get();
                if (peek() === "/") {
                    token.lexeme = token.lexeme.trim(); // remove whitespace padding
                    get();
                    return token;
                }
                token.lexeme += "*";
                continue; // the character after `*` may itself begin the closing `*/`
            }
            token.lexeme += get();
        }
    };

    const delimited = function (delimiter) {
        let token = {token: undefined, lexeme: "", done: false};
        while (peek() !== delimiter) {
            if (peek() === undefined) {
                error("unterminated delimited literal");
            }
            token.lexeme += get();
        }
        get(); // eat trailing delimiter
        if (delimiter === "'") {
            token.token = Token.SYMBOL;
        } else if (delimiter === "\"") {
            token.token = Token.STRING;
        } else {
            error("unknown delimiter syntax");
        }
        return token;
    };

    const next = function () {
        let token = undefined;
        while (at < len) {
            while (is_space(peek())) {
                get();
            }
            let c = get();
            let t = ATOMS[c];
            if (t !== undefined) {
                if ((c === "\"") || (c === "'")) {
                    return delimited(c);
                }
                if ((c === "/") && peek() === "*") {
                    return comment();
                }
                return {token: t, lexeme: c, done: false};
            }
            if (is_alpha(c)) {
                token = {token: Token.IDENTIFIER, lexeme: c, done: false};
                while (is_identifier_char(peek())) {
                    token.lexeme += get();
                }
                if (token.lexeme.slice(-1) === "_") {
                    error("Identifiers cannot end with an underscore.");
                }
                return token;
            }
            if (is_digit(c)) {
                token = {token: Token.INTEGER, lexeme: c, done: false};
                while (is_digit(peek())) {
                    token.lexeme += get();
                }
                return token;
            }
            if (c === undefined) {
                break;
            }
            error(`unexpected character ${c}`);
        }
        // iterator sentinel
        return {token: undefined, lexeme: undefined, done: true};
    };

    return {next: next};
};
Given an ODL input, the program produces the corresponding sequence of tokens, one per call to next().
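A minimal usage sketch follows, assuming lex, Token, and TOKEN_NAMES from the code above are in scope; the sample label is a hypothetical illustration, not the gist's original example.

// Usage sketch: drive the lexer over a small, hypothetical ODL label and print each token.
const sample = [
    "OBJECT     = IMAGE",
    "  LINES    = 1024   /* number of scan lines */",
    "END_OBJECT = IMAGE",
    "END"
].join("\n");

const lexer = lex(sample);
let tok = lexer.next();
while (!tok.done) {
    // TOKEN_NAMES[tok.token] turns the numeric token code back into its name.
    console.log(TOKEN_NAMES[tok.token], JSON.stringify(tok.lexeme));
    tok = lexer.next();
}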