Lexer for the Planetary Data System (PDS) Object Description Language (ODL)
/*
This program implements a lexer for the Object Description Language (ODL), a legacy metadata format
used by NASA's Planetary Data System (PDS).

As of this writing, the ODL version is 2.1, and the specification can be found at:

https://pds.jpl.nasa.gov/documents/sr/Chapter12.pdf

This lexer simply emits the tokens needed by an ODL parser, which is provided as a separate program.

(C) Nabla Zero Labs, 2018
*/
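// For context, an ODL label is a sequence of NAME = VALUE statements, optionally grouped with
// OBJECT ... END_OBJECT blocks and terminated by END. The fragment below is a hypothetical
// illustration (it is not taken from the specification) of the kind of input this lexer consumes:
//
//     OBJECT     = IMAGE
//       LINES    = 1024
//     END_OBJECT = IMAGE
//     END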
"use strict"; | |
const TOKEN_NAMES = [ | |
"NEWLINE", "TAB", "SPACE", | |
"COLON", "COMMA", "PERIOD", | |
"LPAR", "RPAR", "LCURLY", "RCURLY", "LSQUARE", "RSQUARE", | |
"LT", "GT", "EQUAL", | |
"PLUS", "DASH", "ASTERISK", "SLASH", | |
"CIRCUMFLEX", "AT", "HASH", "AMPERSAND", "DOLLAR", | |
"SQUOTE", "DQUOTE", | |
"INTEGER", "IDENTIFIER", "COMMENT", | |
"STRING", "SYMBOL" | |
]; | |
const make_tokens = function () { | |
let tokens = {}; | |
TOKEN_NAMES.forEach(function (name, index) { | |
tokens[name] = index; | |
}); | |
return Object.freeze(tokens); | |
}; | |
const Token = make_tokens(); | |
const ATOMS = { | |
"\n": Token.NEWLINE, | |
"\t": Token.TAB, | |
" ": Token.SPACE, | |
":": Token.COLON, | |
",": Token.COMMA, | |
".": Token.PERIOD, | |
"(": Token.LPAR, | |
")": Token.RPAR, | |
"{": Token.LCURLY, | |
"}": Token.RCURLY, | |
"[": Token.LSQUARE, | |
"]": Token.RSQUARE, | |
"<": Token.LT, | |
">": Token.GT, | |
"=": Token.EQUAL, | |
"+": Token.PLUS, | |
"-": Token.DASH, | |
"*": Token.ASTERISK, | |
"/": Token.SLASH, | |
"^": Token.CIRCUMFLEX, | |
"@": Token.AT, | |
"#": Token.HASH, | |
"&": Token.AMPERSAND, | |
"$": Token.DOLLAR, | |
"'": Token.SQUOTE, | |
"\"": Token.DQUOTE | |
}; | |
const is_alpha = (c) => (/^[A-Za-z]$/).test(c); | |
const is_digit = (c) => (/^[0-9]$/).test(c); | |
const is_identifier_char = (c) => (/^[A-Za-z0-9_]$/).test(c); | |
const is_space = (c) => ((c === " ") || (c === "\t") || (c === "\n") || (c === "\r")); | |
const lex = function (s) {
    let at = 0;
    const len = s.length;

    const peek = () => s[at];

    const get = function () {
        const c = s[at];
        at += 1;
        return c;
    };

    const error = function (m) {
        throw {"what": "SyntaxError", "at": at, "message": m};
    };

    const comment = function () {
        let token = {token: Token.COMMENT, lexeme: "", done: false};
        get(); // eat leading `*`
        while (true) {
            if (peek() === undefined) {
                error("unterminated comment");
            }
            if (peek() === "*") {
                get();
                if (peek() === "/") {
                    token.lexeme = token.lexeme.trim(); // remove whitespace padding
                    get();
                    return token;
                }
                token.lexeme += "*";
                continue; // the character after `*` may itself begin the closing `*/`
            }
            token.lexeme += get();
        }
    };

    const delimited = function (delimiter) {
        let token = {token: undefined, lexeme: "", done: false};
        while (peek() !== delimiter) {
            if (peek() === undefined) {
                error("unterminated delimited literal");
            }
            token.lexeme += get();
        }
        get(); // eat trailing delimiter
        if (delimiter === "'") {
            token.token = Token.SYMBOL;
        } else if (delimiter === "\"") {
            token.token = Token.STRING;
        } else {
            error("unknown delimiter syntax");
        }
        return token;
    };

    const next = function () {
        let token = undefined;
        while (at < len) {
            while (is_space(peek())) {
                get();
            }
            let c = get();
            let t = ATOMS[c];
            if (t !== undefined) {
                if ((c === "\"") || (c === "'")) {
                    return delimited(c);
                }
                if ((c === "/") && peek() === "*") {
                    return comment();
                }
                return {token: t, lexeme: c, done: false};
            }
            if (is_alpha(c)) {
                token = {token: Token.IDENTIFIER, lexeme: c, done: false};
                while (is_identifier_char(peek())) {
                    token.lexeme += get();
                }
                if (token.lexeme.slice(-1) === "_") {
                    error("Identifiers cannot end with an underscore.");
                }
                return token;
            }
            if (is_digit(c)) {
                token = {token: Token.INTEGER, lexeme: c, done: false};
                while (is_digit(peek())) {
                    token.lexeme += get();
                }
                return token;
            }
            if (c === undefined) {
                break;
            }
            error(`unexpected character ${c}`);
        }
        // iterator sentinel
        return {token: undefined, lexeme: undefined, done: true};
    };

    return {next: next};
};
Given an ODL input, the program produces the corresponding sequence of tokens, one per call to next().
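A minimal usage sketch follows, assuming lex, Token, and TOKEN_NAMES from the code above are in scope; the sample label is a hypothetical illustration, not the gist's original example.

// Usage sketch: drive the lexer over a small, hypothetical ODL label and print each token.
const sample = [
    "OBJECT     = IMAGE",
    "  LINES    = 1024   /* number of scan lines */",
    "END_OBJECT = IMAGE",
    "END"
].join("\n");

const lexer = lex(sample);
let tok = lexer.next();
while (!tok.done) {
    // TOKEN_NAMES[tok.token] turns the numeric token code back into its name.
    console.log(TOKEN_NAMES[tok.token], JSON.stringify(tok.lexeme));
    tok = lexer.next();
}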