Skip to content

Instantly share code, notes, and snippets.

@1rosehip
Last active October 5, 2019 16:17
Show Gist options
  • Save 1rosehip/7cc015933942c11b9829ef1f8048491e to your computer and use it in GitHub Desktop.
Save 1rosehip/7cc015933942c11b9829ef1f8048491e to your computer and use it in GitHub Desktop.
LexJS
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Lexical Analysis</title>
</head>
<body>
<script src="lex.js"></script>
</body>
</html>
class Lexer{
/**
* @param {string} input - input to find tokens in; TODO: rename it to request?
*/
constructor(input) {
// input to find tokens in; TODO: rename it to request?
this.input = input;
//symbol table to keep lexemes
this.symbolTable = [];
this.debug = [];
// regular definitions
this.regularDefinitions = {
// A whitespace (\s) in a regular expression matches any one of the characters: space, formfeed (\f), newline (\n), return (\r), tab (\t), vertical tab (\v), non-breaking space (\xA0), as well as the Unicode characters \u00A0 \u2028 \u2029.
ws: /^\s*$/
//,letter: /[A-Za-z]/ // used in id
//,digit: /\d/ // used in id
,id: /^[A-Za-z]([A-Za-z]|\d)*$/ // {letter}({letter}|{digit}) //
/*
,get id() {
return new RegExp(`^${this.letter}(${this.letter}|\\d)*`);
}*/
,number: /\d+(\.\d+)?/g // {digit}+(\.{digit+})?
};
this.transitionRules = {
ws: 'WHITESPACE',
'/*': 'COMMENT',
'<=': 'RELOP',
'<': 'RELOP',
'==': 'RELOP',
'!=': 'RELOP',
'>=': 'RELOP',
'>': 'RELOP',
'(': 'PARENTHESES',
')': 'PARENTHESES',
'{': 'PARENTHESES',
'}': 'PARENTHESES',
':': 'PUNCTUATION',
';': 'PUNCTUATION',
',': 'PUNCTUATION',
'?': 'PUNCTUATION',
id: 'ID',
number: 'NUMBER'
// OPERATORS: *, +, -, \, ^, ...
};
// current lexeme and token name
this.lexeme = '';
this.tokenName = '';
this.lineNumber = 0;
this.index = 0;
this.mode = 0; // 0 - in progress, 1 - error, 2 - success
}
/**
* try to look next symbol
* @param {number} nextIndex
* @return {string|null}
*/
getNextChar(nextIndex){
return nextIndex < this.input.length ? this.input[nextIndex] : null;
}
/**
* search for ending of the comment
*/
lookAheadCommentEnd(initialIndex){
// find substring [initialIndex, end]
const subs = this.input.substring(initialIndex);
// find index of */ in [initialIndex, end]
const newIndex = subs.indexOf('*/') + 1;
return {
lexeme: newIndex !== -1 ? subs.substring(0, newIndex + 1) : '',
index: newIndex !== -1 ? initialIndex + newIndex : -1
};
}
/**
* look ahead, recursive
* @param {number} initialIndex
* @param {RegExp} regularDefinitionRegex
* @param {string} lexeme
* @return {object} {lexeme, index}
*/
lookAhead(initialIndex, regularDefinitionRegex, lexeme){
const nextChar = this.getNextChar(initialIndex + 1);
if(nextChar === null || !regularDefinitionRegex.test(lexeme + nextChar)){
return {
lexeme,
index: initialIndex
};
}
return this.lookAhead(initialIndex + 1, regularDefinitionRegex, lexeme + nextChar);
}
/**
* add current token and lexeme to the symbol table
*/
updateSymbolTable(){
this.symbolTable.push({
token: this.tokenName,
lexeme: this.lexeme,
lineNumber: this.lineNumber
});
}
/**
* add current token and lexeme to the debug table
*/
updateDebug(){
this.debug.push({
token: this.tokenName,
lexeme: this.lexeme,
lineNumber: this.lineNumber
});
}
/**
* handle comment like /* ... *\/
* @return {boolean} true if comment has finished
*/
handleComment(){
const res = this.lookAheadCommentEnd(this.index-2);
const isFinished = res && res.index > this.index;
if (!isFinished) return isFinished;
this.lexeme = res.lexeme;
this.index = res.index;
// add result to the tokens list for debug
this.updateDebug();
// reset the lexeme
this.lexeme = '';
return true;
}
/**
* handle constant like >, <, ...
*/
handleConstant(){
// save result to the symbol and debug table
this.updateSymbolTable();
this.updateDebug();
// reset the lexeme
this.lexeme = '';
}
/**
* handle regex rule
* @param {string} ruleName
* @return {boolean} true if comment has found
*/
handleRegexRule(ruleName){
const regularDefinitionRegex = this.regularDefinitions[ruleName];
const isFound = regularDefinitionRegex && regularDefinitionRegex.test(this.lexeme);
if(!isFound) return isFound;
const res = this.lookAhead(this.index, regularDefinitionRegex, this.lexeme);
if (res && res.index > this.index) {
this.lexeme = res.lexeme;
this.index = res.index;
}
// save result to the symbol table except white spaces
if(ruleName !== 'ws') {
this.updateSymbolTable();
}
else{
// handle line numbers
const newLines = this.lexeme.match(/[\n\r]/g);
this.lineNumber += newLines ? newLines.length : 0;
}
// add result to the tokens list for debug
this.updateDebug();
// reset the lexeme
this.lexeme = '';
return true;
}
/**
* failure
*/
fail(){
this.mode = 1; // 0 - in progress, 1 - error, 2 - success
console.log('Error in line #', this.lineNumber);
}
/**
* success
*/
success(){
this.mode = 2; // 0 - in progress, 1 - error, 2 - success
console.log('success');
}
/**
* parse next char trying to get next token
* @return {boolean} is found?
*/
parseNextChar(){
const char = this.input[this.index];
this.lexeme += char;
// try to find appropriate regex that matches
const rules = Object.keys(this.transitionRules);
// loop though the rules
for(let j=0; j<rules.length; j++){
const ruleName = rules[j];
this.tokenName = this.transitionRules[ruleName];
if(this.lexeme === '/*'){
return this.handleComment();
}
// constant token
if(this.lexeme === ruleName){
this.handleConstant();
return true;
}
// regex rule
if(this.handleRegexRule(ruleName)){
return true;
}
}
return false;
}
/**
* get next token
* @return {object|null} is token found?
*/
getNextToken(){
let token = null;
while((token === null) && this.index < this.input.length){
if(this.parseNextChar()){
token = this.debug[this.debug.length-1];
}
this.index++;
}
return token;
}
/**
* find all tokens
*/
getAllTokens(){
this.debug = [];
this.symbolTable = [];
this.index = 0;
// loop over the input char by char
// TODO: fix this, it should read N chars and use 2 parts buffer
while(this.index < this.input.length){
//const ruleFound = this.parseNextChar();
this.getNextToken();
//this.index++;
}
if(this.lexeme){
this.fail();
}
else{
this.success();
}
}
}
const input = `
int max( i, j )
/* return maximum of integers */
{
return i>j?i:j;
}
`;
const errorInput = `
int max( i, j )
/* return maximum of integers
{
return i>j?i:j;
}
`;
/**
* error
const lex = new Lexer(errorInput);
console.log('The input is:', errorInput);
lex.getAllTokens();
console.log('debug', lex.debug);
console.log('symbol table', lex.symbolTable);
console.log('------------------------------');
*/
/**
* success
const lex1 = new Lexer(input);
console.log('The input is:', input);
lex1.getAllTokens();
console.log('debug', lex1.debug);
console.log('symbol table', lex1.symbolTable);
console.log('------------------------------'); */
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<title>QUnit Example</title>
<link rel="stylesheet" href="https://code.jquery.com/qunit/qunit-2.9.2.css">
</head>
<body>
<div id="qunit"></div>
<div id="qunit-fixture"></div>
<script src="https://code.jquery.com/qunit/qunit-2.9.2.js"></script>
<script src="lex.js"></script>
<script src="tests.js"></script>
</body>
</html>
(() => {
const input = `
int max( i, j )
/* return maximum of integers */
{
return i>j?i:j;
}
`;
const errorInput = `
int max( i, j )
/* return maximum of integers
{
return i>j?i:j;
}
`;
QUnit.test('Input is successfully parsed', (assert) => {
const lex = new Lexer(input);
lex.getAllTokens();
assert.ok(lex.mode === 2, 'Successfully parsed');
});
QUnit.test('Input has error', (assert) => {
const lex = new Lexer(errorInput);
lex.getAllTokens();
assert.ok(lex.mode === 1, 'Error');
});
QUnit.test('First token should be whitespace', (assert) => {
const lex = new Lexer(input);
const token = lex.getNextToken();
assert.ok(token.token === 'WHITESPACE', `First token should be whitespace, but it's: ${token.token}`);
});
QUnit.test('Second token & lexeme', (assert) => {
const lex = new Lexer(input);
lex.getNextToken();
const token = lex.getNextToken();
assert.ok(token.token === 'ID' && token.lexeme === 'int', `token = ${token.token}, lexeme = ${token.lexeme}`);
});
QUnit.test('Third token should be whitespace', (assert) => {
const lex = new Lexer(input);
lex.getNextToken();
lex.getNextToken();
const token = lex.getNextToken();
assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
});
QUnit.test('4th token and lexeme', (assert) => {
const lex = new Lexer(input);
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
const token = lex.getNextToken();
assert.ok(token.token === 'ID' && token.lexeme === 'max', `token = ${token.token}, lexeme = ${token.lexeme}`);
});
QUnit.test('5th token and lexeme', (assert) => {
const lex = new Lexer(input);
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
const token = lex.getNextToken();
assert.ok(token.token === 'PARENTHESES' && token.lexeme === '(', `token = ${token.token}, lexeme = ${token.lexeme}`);
});
QUnit.test('6th token should be whitespace', (assert) => {
const lex = new Lexer(input);
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
lex.getNextToken();
const token = lex.getNextToken();
assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`);
});
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment