1rosehip · October 5, 2019 16:17
diff --git a/lex.html b/lex.html
 <!DOCTYPE html>
 <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Lexical Analysis</title>
    </head>
    <body>
        <script src="lex.js"></script>
    </body>
 </html>
diff --git a/lex.js b/lex.js
 class Lexer{

    /**
     * @param {string} input - input to find tokens in; TODO: rename it to request?
     */
    constructor(input) {

        // input to find tokens in; TODO: rename it to request?
        this.input = input;

        //symbol table to keep lexemes
        this.symbolTable = [];
        this.debug = [];

        // regular definitions
        this.regularDefinitions = {

            // A whitespace (\s) in a regular expression matches any one of the characters: space, formfeed (\f), newline (\n), return (\r), tab (\t), vertical tab (\v), non-breaking space (\xA0), as well as the Unicode characters \u00A0 \u2028 \u2029.
            ws: /^\s*$/

            //,letter: /[A-Za-z]/ // used in id
            //,digit: /\d/ // used in id
            ,id: /^[A-Za-z]([A-Za-z]|\d)*$/  // {letter}({letter}|{digit})  //

            /*
            ,get id() {
                return new RegExp(`^${this.letter}(${this.letter}|\\d)*`);
            }*/

            ,number: /\d+(\.\d+)?/g // {digit}+(\.{digit+})?

        };

        this.transitionRules = {

            ws: 'WHITESPACE',
            '/*': 'COMMENT',

            '<=': 'RELOP',
            '<': 'RELOP',
            '==': 'RELOP',
            '!=': 'RELOP',
            '>=': 'RELOP',
            '>': 'RELOP',

            '(': 'PARENTHESES',
            ')': 'PARENTHESES',
            '{': 'PARENTHESES',
            '}': 'PARENTHESES',

            ':': 'PUNCTUATION',
            ';': 'PUNCTUATION',
            ',': 'PUNCTUATION',
            '?': 'PUNCTUATION',

            id: 'ID',
            number: 'NUMBER'

            // OPERATORS: *, +, -, \, ^, ...
        };

        // current lexeme and token name
        this.lexeme = '';
        this.tokenName = '';
        this.lineNumber = 0;

        this.index = 0;
        this.mode = 0; // 0 - in progress, 1 - error, 2 - success
    }

    /**
     * try to look next symbol
     * @param {number} nextIndex
     * @return {string|null}
     */
    getNextChar(nextIndex){
        return nextIndex < this.input.length ? this.input[nextIndex] : null;
    }

    /**
     * search for ending of the comment
     */
    lookAheadCommentEnd(initialIndex){

        // find substring [initialIndex, end]
        const subs = this.input.substring(initialIndex);

        // find index of */ in [initialIndex, end]
        const newIndex = subs.indexOf('*/') + 1;

        return {
            lexeme: newIndex !== -1 ? subs.substring(0, newIndex + 1) : '',
            index: newIndex !== -1 ? initialIndex + newIndex : -1
        };
    }

    /**
     * look ahead, recursive
     * @param {number} initialIndex
     * @param {RegExp} regularDefinitionRegex
     * @param {string} lexeme
     * @return {object} {lexeme, index}
     */
    lookAhead(initialIndex, regularDefinitionRegex, lexeme){

        const nextChar = this.getNextChar(initialIndex + 1);

        if(nextChar === null || !regularDefinitionRegex.test(lexeme + nextChar)){

            return {
                lexeme,
                index: initialIndex
            };
        }

        return this.lookAhead(initialIndex + 1, regularDefinitionRegex, lexeme + nextChar);
    }

    /**
     * add current token and lexeme to the symbol table
     */
    updateSymbolTable(){
        this.symbolTable.push({
            token: this.tokenName,
            lexeme: this.lexeme,
            lineNumber: this.lineNumber
        });
    }

    /**
     * add current token and lexeme to the debug table
     */
    updateDebug(){
        this.debug.push({
            token: this.tokenName,
            lexeme: this.lexeme,
            lineNumber: this.lineNumber
        });
    }

    /**
     * handle comment like /* ... *\/
     * @return {boolean} true if comment has finished
     */
    handleComment(){

        const res = this.lookAheadCommentEnd(this.index-2);
        const isFinished = res && res.index > this.index;

        if (!isFinished) return isFinished;

        this.lexeme = res.lexeme;
        this.index = res.index;

        // add result to the tokens list for debug
        this.updateDebug();

        // reset the lexeme
        this.lexeme = '';

        return true;
    }

    /**
     * handle constant like >, <, ...
     */
    handleConstant(){

        // save result to the symbol and debug table
        this.updateSymbolTable();
        this.updateDebug();

        // reset the lexeme
        this.lexeme = '';
    }

    /**
     * handle regex rule
     * @param {string} ruleName
     * @return {boolean} true if comment has found
     */
    handleRegexRule(ruleName){

        const regularDefinitionRegex = this.regularDefinitions[ruleName];
        const isFound = regularDefinitionRegex && regularDefinitionRegex.test(this.lexeme);
        if(!isFound) return isFound;

        const res = this.lookAhead(this.index, regularDefinitionRegex, this.lexeme);

        if (res && res.index > this.index) {

            this.lexeme = res.lexeme;
            this.index = res.index;
        }

        // save result to the symbol table except white spaces
        if(ruleName !== 'ws') {
            this.updateSymbolTable();
        }
        else{
            // handle line numbers
            const newLines = this.lexeme.match(/[\n\r]/g);
            this.lineNumber += newLines ? newLines.length : 0;
        }

        // add result to the tokens list for debug
        this.updateDebug();

        // reset the lexeme
        this.lexeme = '';

        return true;
    }

    /**
     * failure
     */
    fail(){
        this.mode = 1; // 0 - in progress, 1 - error, 2 - success
        console.log('Error in line #', this.lineNumber);
    }

    /**
     * success
     */
    success(){
        this.mode = 2; // 0 - in progress, 1 - error, 2 - success
        console.log('success');
    }

    /**
     * parse next char trying to get next token
     * @return {boolean} is found?
     */
    parseNextChar(){

        const char = this.input[this.index];

        this.lexeme += char;

        // try to find appropriate regex that matches
        const rules = Object.keys(this.transitionRules);

        // loop though the rules
        for(let j=0; j<rules.length; j++){

            const ruleName = rules[j];
            this.tokenName = this.transitionRules[ruleName];

            if(this.lexeme === '/*'){
                return this.handleComment();
            }

            // constant token
            if(this.lexeme === ruleName){
                this.handleConstant();
                return true;
            }

            // regex rule
            if(this.handleRegexRule(ruleName)){
                return true;
            }
        }

        return false;
    }

    /**
     * get next token
     * @return {object|null} is token found?
     */
    getNextToken(){
        let token = null;

        while((token === null) && this.index < this.input.length){
            if(this.parseNextChar()){
                token = this.debug[this.debug.length-1];
            }
            this.index++;
        }

        return token;
    }

    /**
     * find all tokens
     */
    getAllTokens(){
        this.debug = [];
        this.symbolTable = [];
        this.index = 0;

        // loop over the input char by char
        // TODO: fix this, it should read N chars and use 2 parts buffer
        while(this.index < this.input.length){
            //const ruleFound = this.parseNextChar();
            this.getNextToken();
            //this.index++;
        }

        if(this.lexeme){
            this.fail();
        }
        else{
            this.success();
        }
    }
 }


 const input = `
 int max( i, j ) 
 /* return maximum of integers */
 {
  return i>j?i:j;
 }
 `;

 const errorInput = `
 int max( i, j ) 
 /* return maximum of integers 
 {
  return i>j?i:j;
 }
 `;

 /**
 * error

 const lex = new Lexer(errorInput);
 console.log('The input is:', errorInput);
 lex.getAllTokens();
 console.log('debug', lex.debug);
 console.log('symbol table', lex.symbolTable);
 console.log('------------------------------');
 */

 /**
 * success

 const lex1 = new Lexer(input);
 console.log('The input is:', input);
 lex1.getAllTokens();
 console.log('debug', lex1.debug);
 console.log('symbol table', lex1.symbolTable);
 console.log('------------------------------'); */
diff --git a/test.html b/test.html
 <!doctype html>
 <html>
    <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width">
        <title>QUnit Example</title>
        <link rel="stylesheet" href="https://code.jquery.com/qunit/qunit-2.9.2.css">
    </head>
    <body>
        <div id="qunit"></div>
        <div id="qunit-fixture"></div>

        <script src="https://code.jquery.com/qunit/qunit-2.9.2.js"></script>
        <script src="lex.js"></script>
        <script src="tests.js"></script>
    </body>
 </html>
diff --git a/tests.js b/tests.js
 (() => {
    const input = `
        int max( i, j ) 
        /* return maximum of integers */
        {
          return i>j?i:j;
        }
    `;

    const errorInput = `
        int max( i, j ) 
        /* return maximum of integers 
        {
          return i>j?i:j;
        }
    `;

    QUnit.test('Input is successfully parsed', (assert) => {
        const lex = new Lexer(input);
        lex.getAllTokens();
        assert.ok(lex.mode === 2, 'Successfully parsed');
    });

    QUnit.test('Input has error', (assert) => {
        const lex = new Lexer(errorInput);
        lex.getAllTokens();
        assert.ok(lex.mode === 1, 'Error');
    });

    QUnit.test('First token should be whitespace', (assert) => {
        const lex = new Lexer(input);
        const token = lex.getNextToken();
        assert.ok(token.token === 'WHITESPACE', `First token should be whitespace, but it's: ${token.token}`);
    });

    QUnit.test('Second token & lexeme', (assert) => {
        const lex = new Lexer(input);
        lex.getNextToken();
        const token = lex.getNextToken();
        assert.ok(token.token === 'ID' && token.lexeme === 'int', `token = ${token.token}, lexeme = ${token.lexeme}`);
    });

    QUnit.test('Third token should be whitespace', (assert) => {
        const lex = new Lexer(input);
        lex.getNextToken();
        lex.getNextToken();
        const token = lex.getNextToken();
        assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
    });

    QUnit.test('4th token and lexeme', (assert) => {
        const lex = new Lexer(input);
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        const token = lex.getNextToken();
        assert.ok(token.token === 'ID' && token.lexeme === 'max', `token = ${token.token}, lexeme = ${token.lexeme}`);
    });

    QUnit.test('5th token and lexeme', (assert) => {
        const lex = new Lexer(input);
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        const token = lex.getNextToken();
        assert.ok(token.token === 'PARENTHESES' && token.lexeme === '(', `token = ${token.token}, lexeme = ${token.lexeme}`);
    });

    QUnit.test('6th token should be whitespace', (assert) => {
        const lex = new Lexer(input);
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        lex.getNextToken();
        const token = lex.getNextToken();
        assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
    });
 })();
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<title>Lexical Analysis</title>
	</head>
	<body>
	<script src="lex.js"></script>
	</body>
	</html>
	class Lexer{

	/**
	* @param {string} input - input to find tokens in; TODO: rename it to request?
	*/
	constructor(input) {

	// input to find tokens in; TODO: rename it to request?
	this.input = input;

	//symbol table to keep lexemes
	this.symbolTable = [];
	this.debug = [];

	// regular definitions
	this.regularDefinitions = {

	// A whitespace (\s) in a regular expression matches any one of the characters: space, formfeed (\f), newline (\n), return (\r), tab (\t), vertical tab (\v), non-breaking space (\xA0), as well as the Unicode characters \u00A0 \u2028 \u2029.
	ws: /^\s*$/

	//,letter: /[A-Za-z]/ // used in id
	//,digit: /\d/ // used in id
	,id: /^[A-Za-z]([A-Za-z]\|\d)*$/ // {letter}({letter}\|{digit}) //

	/*
	,get id() {
	return new RegExp(`^${this.letter}(${this.letter}\|\\d)*`);
	}*/

	,number: /\d+(\.\d+)?/g // {digit}+(\.{digit+})?

	};

	this.transitionRules = {

	ws: 'WHITESPACE',
	'/*': 'COMMENT',

	'<=': 'RELOP',
	'<': 'RELOP',
	'==': 'RELOP',
	'!=': 'RELOP',
	'>=': 'RELOP',
	'>': 'RELOP',

	'(': 'PARENTHESES',
	')': 'PARENTHESES',
	'{': 'PARENTHESES',
	'}': 'PARENTHESES',

	':': 'PUNCTUATION',
	';': 'PUNCTUATION',
	',': 'PUNCTUATION',
	'?': 'PUNCTUATION',

	id: 'ID',
	number: 'NUMBER'

	// OPERATORS: *, +, -, \, ^, ...
	};

	// current lexeme and token name
	this.lexeme = '';
	this.tokenName = '';
	this.lineNumber = 0;

	this.index = 0;
	this.mode = 0; // 0 - in progress, 1 - error, 2 - success
	}

	/**
	* try to look next symbol
	* @param {number} nextIndex
	* @return {string\|null}
	*/
	getNextChar(nextIndex){
	return nextIndex < this.input.length ? this.input[nextIndex] : null;
	}

	/**
	* search for ending of the comment
	*/
	lookAheadCommentEnd(initialIndex){

	// find substring [initialIndex, end]
	const subs = this.input.substring(initialIndex);

	// find index of */ in [initialIndex, end]
	const newIndex = subs.indexOf('*/') + 1;

	return {
	lexeme: newIndex !== -1 ? subs.substring(0, newIndex + 1) : '',
	index: newIndex !== -1 ? initialIndex + newIndex : -1
	};
	}

	/**
	* look ahead, recursive
	* @param {number} initialIndex
	* @param {RegExp} regularDefinitionRegex
	* @param {string} lexeme
	* @return {object} {lexeme, index}
	*/
	lookAhead(initialIndex, regularDefinitionRegex, lexeme){

	const nextChar = this.getNextChar(initialIndex + 1);

	if(nextChar === null \|\| !regularDefinitionRegex.test(lexeme + nextChar)){

	return {
	lexeme,
	index: initialIndex
	};
	}

	return this.lookAhead(initialIndex + 1, regularDefinitionRegex, lexeme + nextChar);
	}

	/**
	* add current token and lexeme to the symbol table
	*/
	updateSymbolTable(){
	this.symbolTable.push({
	token: this.tokenName,
	lexeme: this.lexeme,
	lineNumber: this.lineNumber
	});
	}

	/**
	* add current token and lexeme to the debug table
	*/
	updateDebug(){
	this.debug.push({
	token: this.tokenName,
	lexeme: this.lexeme,
	lineNumber: this.lineNumber
	});
	}

	/**
	* handle comment like /* ... *\/
	* @return {boolean} true if comment has finished
	*/
	handleComment(){

	const res = this.lookAheadCommentEnd(this.index-2);
	const isFinished = res && res.index > this.index;

	if (!isFinished) return isFinished;

	this.lexeme = res.lexeme;
	this.index = res.index;

	// add result to the tokens list for debug
	this.updateDebug();

	// reset the lexeme
	this.lexeme = '';

	return true;
	}

	/**
	* handle constant like >, <, ...
	*/
	handleConstant(){

	// save result to the symbol and debug table
	this.updateSymbolTable();
	this.updateDebug();

	// reset the lexeme
	this.lexeme = '';
	}

	/**
	* handle regex rule
	* @param {string} ruleName
	* @return {boolean} true if comment has found
	*/
	handleRegexRule(ruleName){

	const regularDefinitionRegex = this.regularDefinitions[ruleName];
	const isFound = regularDefinitionRegex && regularDefinitionRegex.test(this.lexeme);
	if(!isFound) return isFound;

	const res = this.lookAhead(this.index, regularDefinitionRegex, this.lexeme);

	if (res && res.index > this.index) {

	this.lexeme = res.lexeme;
	this.index = res.index;
	}

	// save result to the symbol table except white spaces
	if(ruleName !== 'ws') {
	this.updateSymbolTable();
	}
	else{
	// handle line numbers
	const newLines = this.lexeme.match(/[\n\r]/g);
	this.lineNumber += newLines ? newLines.length : 0;
	}

	// add result to the tokens list for debug
	this.updateDebug();

	// reset the lexeme
	this.lexeme = '';

	return true;
	}

	/**
	* failure
	*/
	fail(){
	this.mode = 1; // 0 - in progress, 1 - error, 2 - success
	console.log('Error in line #', this.lineNumber);
	}

	/**
	* success
	*/
	success(){
	this.mode = 2; // 0 - in progress, 1 - error, 2 - success
	console.log('success');
	}

	/**
	* parse next char trying to get next token
	* @return {boolean} is found?
	*/
	parseNextChar(){

	const char = this.input[this.index];

	this.lexeme += char;

	// try to find appropriate regex that matches
	const rules = Object.keys(this.transitionRules);

	// loop though the rules
	for(let j=0; j<rules.length; j++){

	const ruleName = rules[j];
	this.tokenName = this.transitionRules[ruleName];

	if(this.lexeme === '/*'){
	return this.handleComment();
	}

	// constant token
	if(this.lexeme === ruleName){
	this.handleConstant();
	return true;
	}

	// regex rule
	if(this.handleRegexRule(ruleName)){
	return true;
	}
	}

	return false;
	}

	/**
	* get next token
	* @return {object\|null} is token found?
	*/
	getNextToken(){
	let token = null;

	while((token === null) && this.index < this.input.length){
	if(this.parseNextChar()){
	token = this.debug[this.debug.length-1];
	}
	this.index++;
	}

	return token;
	}

	/**
	* find all tokens
	*/
	getAllTokens(){
	this.debug = [];
	this.symbolTable = [];
	this.index = 0;

	// loop over the input char by char
	// TODO: fix this, it should read N chars and use 2 parts buffer
	while(this.index < this.input.length){
	//const ruleFound = this.parseNextChar();
	this.getNextToken();
	//this.index++;
	}

	if(this.lexeme){
	this.fail();
	}
	else{
	this.success();
	}
	}
	}


	const input = `
	int max( i, j )
	/* return maximum of integers */
	{
	return i>j?i:j;
	}
	`;

	const errorInput = `
	int max( i, j )
	/* return maximum of integers
	{
	return i>j?i:j;
	}
	`;

	/**
	* error

	const lex = new Lexer(errorInput);
	console.log('The input is:', errorInput);
	lex.getAllTokens();
	console.log('debug', lex.debug);
	console.log('symbol table', lex.symbolTable);
	console.log('------------------------------');
	*/

	/**
	* success

	const lex1 = new Lexer(input);
	console.log('The input is:', input);
	lex1.getAllTokens();
	console.log('debug', lex1.debug);
	console.log('symbol table', lex1.symbolTable);
	console.log('------------------------------'); */
	<!doctype html>
	<html>
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width">
	<title>QUnit Example</title>
	<link rel="stylesheet" href="https://code.jquery.com/qunit/qunit-2.9.2.css">
	</head>
	<body>
	<div id="qunit"></div>
	<div id="qunit-fixture"></div>

	<script src="https://code.jquery.com/qunit/qunit-2.9.2.js"></script>
	<script src="lex.js"></script>
	<script src="tests.js"></script>
	</body>
	</html>
	(() => {
	const input = `
	int max( i, j )
	/* return maximum of integers */
	{
	return i>j?i:j;
	}
	`;

	const errorInput = `
	int max( i, j )
	/* return maximum of integers
	{
	return i>j?i:j;
	}
	`;

	QUnit.test('Input is successfully parsed', (assert) => {
	const lex = new Lexer(input);
	lex.getAllTokens();
	assert.ok(lex.mode === 2, 'Successfully parsed');
	});

	QUnit.test('Input has error', (assert) => {
	const lex = new Lexer(errorInput);
	lex.getAllTokens();
	assert.ok(lex.mode === 1, 'Error');
	});

	QUnit.test('First token should be whitespace', (assert) => {
	const lex = new Lexer(input);
	const token = lex.getNextToken();
	assert.ok(token.token === 'WHITESPACE', `First token should be whitespace, but it's: ${token.token}`);
	});

	QUnit.test('Second token & lexeme', (assert) => {
	const lex = new Lexer(input);
	lex.getNextToken();
	const token = lex.getNextToken();
	assert.ok(token.token === 'ID' && token.lexeme === 'int', `token = ${token.token}, lexeme = ${token.lexeme}`);
	});

	QUnit.test('Third token should be whitespace', (assert) => {
	const lex = new Lexer(input);
	lex.getNextToken();
	lex.getNextToken();
	const token = lex.getNextToken();
	assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
	});

	QUnit.test('4th token and lexeme', (assert) => {
	const lex = new Lexer(input);
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	const token = lex.getNextToken();
	assert.ok(token.token === 'ID' && token.lexeme === 'max', `token = ${token.token}, lexeme = ${token.lexeme}`);
	});

	QUnit.test('5th token and lexeme', (assert) => {
	const lex = new Lexer(input);
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	const token = lex.getNextToken();
	assert.ok(token.token === 'PARENTHESES' && token.lexeme === '(', `token = ${token.token}, lexeme = ${token.lexeme}`);
	});

	QUnit.test('6th token should be whitespace', (assert) => {
	const lex = new Lexer(input);
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	lex.getNextToken();
	const token = lex.getNextToken();
	assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
	});
	})();