Last active
October 5, 2019 16:17
-
-
Save 1rosehip/7cc015933942c11b9829ef1f8048491e to your computer and use it in GitHub Desktop.
LexJS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html lang="en"> | |
<head> | |
<meta charset="UTF-8"> | |
<title>Lexical Analysis</title> | |
</head> | |
<body> | |
<script src="lex.js"></script> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Lexer{ | |
/** | |
* @param {string} input - input to find tokens in; TODO: rename it to request? | |
*/ | |
constructor(input) { | |
// input to find tokens in; TODO: rename it to request? | |
this.input = input; | |
//symbol table to keep lexemes | |
this.symbolTable = []; | |
this.debug = []; | |
// regular definitions | |
this.regularDefinitions = { | |
// A whitespace (\s) in a regular expression matches any one of the characters: space, formfeed (\f), newline (\n), return (\r), tab (\t), vertical tab (\v), non-breaking space (\xA0), as well as the Unicode characters \u00A0 \u2028 \u2029. | |
ws: /^\s*$/ | |
//,letter: /[A-Za-z]/ // used in id | |
//,digit: /\d/ // used in id | |
,id: /^[A-Za-z]([A-Za-z]|\d)*$/ // {letter}({letter}|{digit}) // | |
/* | |
,get id() { | |
return new RegExp(`^${this.letter}(${this.letter}|\\d)*`); | |
}*/ | |
,number: /\d+(\.\d+)?/g // {digit}+(\.{digit+})? | |
}; | |
this.transitionRules = { | |
ws: 'WHITESPACE', | |
'/*': 'COMMENT', | |
'<=': 'RELOP', | |
'<': 'RELOP', | |
'==': 'RELOP', | |
'!=': 'RELOP', | |
'>=': 'RELOP', | |
'>': 'RELOP', | |
'(': 'PARENTHESES', | |
')': 'PARENTHESES', | |
'{': 'PARENTHESES', | |
'}': 'PARENTHESES', | |
':': 'PUNCTUATION', | |
';': 'PUNCTUATION', | |
',': 'PUNCTUATION', | |
'?': 'PUNCTUATION', | |
id: 'ID', | |
number: 'NUMBER' | |
// OPERATORS: *, +, -, \, ^, ... | |
}; | |
// current lexeme and token name | |
this.lexeme = ''; | |
this.tokenName = ''; | |
this.lineNumber = 0; | |
this.index = 0; | |
this.mode = 0; // 0 - in progress, 1 - error, 2 - success | |
} | |
/** | |
* try to look next symbol | |
* @param {number} nextIndex | |
* @return {string|null} | |
*/ | |
getNextChar(nextIndex){ | |
return nextIndex < this.input.length ? this.input[nextIndex] : null; | |
} | |
/** | |
* search for ending of the comment | |
*/ | |
lookAheadCommentEnd(initialIndex){ | |
// find substring [initialIndex, end] | |
const subs = this.input.substring(initialIndex); | |
// find index of */ in [initialIndex, end] | |
const newIndex = subs.indexOf('*/') + 1; | |
return { | |
lexeme: newIndex !== -1 ? subs.substring(0, newIndex + 1) : '', | |
index: newIndex !== -1 ? initialIndex + newIndex : -1 | |
}; | |
} | |
/** | |
* look ahead, recursive | |
* @param {number} initialIndex | |
* @param {RegExp} regularDefinitionRegex | |
* @param {string} lexeme | |
* @return {object} {lexeme, index} | |
*/ | |
lookAhead(initialIndex, regularDefinitionRegex, lexeme){ | |
const nextChar = this.getNextChar(initialIndex + 1); | |
if(nextChar === null || !regularDefinitionRegex.test(lexeme + nextChar)){ | |
return { | |
lexeme, | |
index: initialIndex | |
}; | |
} | |
return this.lookAhead(initialIndex + 1, regularDefinitionRegex, lexeme + nextChar); | |
} | |
/** | |
* add current token and lexeme to the symbol table | |
*/ | |
updateSymbolTable(){ | |
this.symbolTable.push({ | |
token: this.tokenName, | |
lexeme: this.lexeme, | |
lineNumber: this.lineNumber | |
}); | |
} | |
/** | |
* add current token and lexeme to the debug table | |
*/ | |
updateDebug(){ | |
this.debug.push({ | |
token: this.tokenName, | |
lexeme: this.lexeme, | |
lineNumber: this.lineNumber | |
}); | |
} | |
/** | |
* handle comment like /* ... *\/ | |
* @return {boolean} true if comment has finished | |
*/ | |
handleComment(){ | |
const res = this.lookAheadCommentEnd(this.index-2); | |
const isFinished = res && res.index > this.index; | |
if (!isFinished) return isFinished; | |
this.lexeme = res.lexeme; | |
this.index = res.index; | |
// add result to the tokens list for debug | |
this.updateDebug(); | |
// reset the lexeme | |
this.lexeme = ''; | |
return true; | |
} | |
/** | |
* handle constant like >, <, ... | |
*/ | |
handleConstant(){ | |
// save result to the symbol and debug table | |
this.updateSymbolTable(); | |
this.updateDebug(); | |
// reset the lexeme | |
this.lexeme = ''; | |
} | |
/** | |
* handle regex rule | |
* @param {string} ruleName | |
* @return {boolean} true if comment has found | |
*/ | |
handleRegexRule(ruleName){ | |
const regularDefinitionRegex = this.regularDefinitions[ruleName]; | |
const isFound = regularDefinitionRegex && regularDefinitionRegex.test(this.lexeme); | |
if(!isFound) return isFound; | |
const res = this.lookAhead(this.index, regularDefinitionRegex, this.lexeme); | |
if (res && res.index > this.index) { | |
this.lexeme = res.lexeme; | |
this.index = res.index; | |
} | |
// save result to the symbol table except white spaces | |
if(ruleName !== 'ws') { | |
this.updateSymbolTable(); | |
} | |
else{ | |
// handle line numbers | |
const newLines = this.lexeme.match(/[\n\r]/g); | |
this.lineNumber += newLines ? newLines.length : 0; | |
} | |
// add result to the tokens list for debug | |
this.updateDebug(); | |
// reset the lexeme | |
this.lexeme = ''; | |
return true; | |
} | |
/** | |
* failure | |
*/ | |
fail(){ | |
this.mode = 1; // 0 - in progress, 1 - error, 2 - success | |
console.log('Error in line #', this.lineNumber); | |
} | |
/** | |
* success | |
*/ | |
success(){ | |
this.mode = 2; // 0 - in progress, 1 - error, 2 - success | |
console.log('success'); | |
} | |
/** | |
* parse next char trying to get next token | |
* @return {boolean} is found? | |
*/ | |
parseNextChar(){ | |
const char = this.input[this.index]; | |
this.lexeme += char; | |
// try to find appropriate regex that matches | |
const rules = Object.keys(this.transitionRules); | |
// loop though the rules | |
for(let j=0; j<rules.length; j++){ | |
const ruleName = rules[j]; | |
this.tokenName = this.transitionRules[ruleName]; | |
if(this.lexeme === '/*'){ | |
return this.handleComment(); | |
} | |
// constant token | |
if(this.lexeme === ruleName){ | |
this.handleConstant(); | |
return true; | |
} | |
// regex rule | |
if(this.handleRegexRule(ruleName)){ | |
return true; | |
} | |
} | |
return false; | |
} | |
/** | |
* get next token | |
* @return {object|null} is token found? | |
*/ | |
getNextToken(){ | |
let token = null; | |
while((token === null) && this.index < this.input.length){ | |
if(this.parseNextChar()){ | |
token = this.debug[this.debug.length-1]; | |
} | |
this.index++; | |
} | |
return token; | |
} | |
/** | |
* find all tokens | |
*/ | |
getAllTokens(){ | |
this.debug = []; | |
this.symbolTable = []; | |
this.index = 0; | |
// loop over the input char by char | |
// TODO: fix this, it should read N chars and use 2 parts buffer | |
while(this.index < this.input.length){ | |
//const ruleFound = this.parseNextChar(); | |
this.getNextToken(); | |
//this.index++; | |
} | |
if(this.lexeme){ | |
this.fail(); | |
} | |
else{ | |
this.success(); | |
} | |
} | |
} | |
const input = ` | |
int max( i, j ) | |
/* return maximum of integers */ | |
{ | |
return i>j?i:j; | |
} | |
`; | |
const errorInput = ` | |
int max( i, j ) | |
/* return maximum of integers | |
{ | |
return i>j?i:j; | |
} | |
`; | |
/** | |
* error | |
const lex = new Lexer(errorInput); | |
console.log('The input is:', errorInput); | |
lex.getAllTokens(); | |
console.log('debug', lex.debug); | |
console.log('symbol table', lex.symbolTable); | |
console.log('------------------------------'); | |
*/ | |
/** | |
* success | |
const lex1 = new Lexer(input); | |
console.log('The input is:', input); | |
lex1.getAllTokens(); | |
console.log('debug', lex1.debug); | |
console.log('symbol table', lex1.symbolTable); | |
console.log('------------------------------'); */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html> | |
<html> | |
<head> | |
<meta charset="utf-8"> | |
<meta name="viewport" content="width=device-width"> | |
<title>QUnit Example</title> | |
<link rel="stylesheet" href="https://code.jquery.com/qunit/qunit-2.9.2.css"> | |
</head> | |
<body> | |
<div id="qunit"></div> | |
<div id="qunit-fixture"></div> | |
<script src="https://code.jquery.com/qunit/qunit-2.9.2.js"></script> | |
<script src="lex.js"></script> | |
<script src="tests.js"></script> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(() => { | |
const input = ` | |
int max( i, j ) | |
/* return maximum of integers */ | |
{ | |
return i>j?i:j; | |
} | |
`; | |
const errorInput = ` | |
int max( i, j ) | |
/* return maximum of integers | |
{ | |
return i>j?i:j; | |
} | |
`; | |
QUnit.test('Input is successfully parsed', (assert) => { | |
const lex = new Lexer(input); | |
lex.getAllTokens(); | |
assert.ok(lex.mode === 2, 'Successfully parsed'); | |
}); | |
QUnit.test('Input has error', (assert) => { | |
const lex = new Lexer(errorInput); | |
lex.getAllTokens(); | |
assert.ok(lex.mode === 1, 'Error'); | |
}); | |
QUnit.test('First token should be whitespace', (assert) => { | |
const lex = new Lexer(input); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'WHITESPACE', `First token should be whitespace, but it's: ${token.token}`); | |
}); | |
QUnit.test('Second token & lexeme', (assert) => { | |
const lex = new Lexer(input); | |
lex.getNextToken(); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'ID' && token.lexeme === 'int', `token = ${token.token}, lexeme = ${token.lexeme}`); | |
}); | |
QUnit.test('Third token should be whitespace', (assert) => { | |
const lex = new Lexer(input); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`); | |
}); | |
QUnit.test('4th token and lexeme', (assert) => { | |
const lex = new Lexer(input); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'ID' && token.lexeme === 'max', `token = ${token.token}, lexeme = ${token.lexeme}`); | |
}); | |
QUnit.test('5th token and lexeme', (assert) => { | |
const lex = new Lexer(input); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'PARENTHESES' && token.lexeme === '(', `token = ${token.token}, lexeme = ${token.lexeme}`); | |
}); | |
QUnit.test('6th token should be whitespace', (assert) => { | |
const lex = new Lexer(input); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
lex.getNextToken(); | |
const token = lex.getNextToken(); | |
assert.ok(token.token === 'WHITESPACE', `Third token should be whitespace, but it's: ${token.token}`); | |
}); | |
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment