Last active
March 7, 2019 14:06
-
-
Save nixorn/a8f1acb9c957f513e6072c0fcde8141b to your computer and use it in GitHub Desktop.
Sample of chevrotain based haskell lexer with indent support (poorly tested)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta content="text/html; charset=utf-8" http-equiv="content-type"> | |
</head> | |
<body> | |
<div> | |
Input: | |
<textarea id="in" name="input" cols="40" rows="3"></textarea>
<button onclick="lex()"> Parse </button> | |
</div> | |
<script type="text/javascript" src="https://unpkg.com/[email protected]/lib/chevrotain.min.js"> </script> | |
<script src="https://cdn.jsdelivr.net/npm/[email protected]/lodash.min.js"></script> | |
<script type="text/javascript"> | |
/**
 * Click handler for the "Parse" button: builds a Chevrotain lexer for a subset
 * of Haskell (with Indent/Outdent tokens derived from leading spaces), runs it
 * on the contents of the `#in` textarea and logs the lexing result.
 *
 * Relies on the `chevrotain` global loaded by the page's script tag.
 * Takes no arguments and returns nothing; the result is written to the console.
 */
function lex() {
  const { createToken, createTokenInstance, Lexer } = chevrotain;

  // ---------------------------------------------------------------------
  // Identifiers and operator symbols.
  // These are defined first because keywords / reserved operators refer to
  // them through `longer_alt`.
  // ---------------------------------------------------------------------
  const VarId = createToken({ name: "VarId", pattern: /[a-z_][\w']*/ });
  const ConId = createToken({ name: "ConId", pattern: /[A-Z][\w']*/ });
  const VarSym = createToken({
    name: "VarSym",
    pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/,
  });

  // Qualified names: one or more capitalised module segments, then the name.
  const QVarId = createToken({
    name: "QVarId",
    pattern: /(?:[A-Z][\w']*\.)+[a-z_][\w']*/,
  });
  const QVarSym = createToken({
    name: "QVarSym",
    pattern: /(?:[A-Z][\w']*\.)+[:!#%&*.\/?@$+<=>^|~\\-]+/,
  });
  const QConId = createToken({
    name: "QConId",
    pattern: /(?:[A-Z][\w']*\.)+[A-Z][\w']*/,
  });

  // ---------------------------------------------------------------------
  // Literals and skipped whitespace.
  // ---------------------------------------------------------------------
  const WS = createToken({ name: "WS", group: Lexer.SKIPPED, pattern: / +/ });
  const FloatTok = createToken({ name: "FloatTok", pattern: /\d+\.\d+/ });
  const IntTok = createToken({ name: "IntTok", pattern: /\d+/ });
  // Double-quoted, single-line string literal with backslash escapes.
  // The local is not called `String` so the built-in is not shadowed.
  const StringTok = createToken({
    name: "String",
    pattern: /"(?:[^"\\\r\n]|\\.)*"/,
    line_breaks: false,
  });

  // ---------------------------------------------------------------------
  // Token factories.
  // ---------------------------------------------------------------------
  // A keyword yields to VarId when the identifier continues past it
  // (e.g. `inner` must not lex as `in` + `ner`).
  const keyword = ([name, pattern]) =>
    createToken({ name, pattern, longer_alt: VarId });
  // A reserved operator yields to VarSym when more symbol characters follow
  // (e.g. `==` must not lex as two `=`).
  const reservedOp = ([name, pattern]) =>
    createToken({ name, pattern, longer_alt: VarSym });
  const punctuation = ([name, pattern]) => createToken({ name, pattern });

  // Within each table a token that is a prefix of another comes AFTER it,
  // because the lexer takes the first token type that matches.
  const reservedOps = [
    ["DotDot", ".."],
    ["DoubleColon", "::"],
    ["Colon", ":"],
    ["DoubleArrow", "=>"],
    ["Equals", "="],
    ["Backslash", "\\"],
    ["Bar", "|"],
    ["LeftArrow", "<-"],
    ["RightArrow", "->"],
    ["At", "@"],
    ["Tilde", "~"],
    ["Minus", "-"],
    ["Exclamation", "!"],
  ].map(reservedOp);

  const punctuationTokens = [
    ["LeftParen", "("],
    ["RightParen", ")"],
    ["SemiColon", ";"],
    ["LeftCurly", "{"],
    ["RightCurly", "}"],
    ["LeftSquare", "["],
    ["RightSquare", "]"],
    ["Comma", ","],
    ["BackQuote", "`"],
  ].map(punctuation);

  const keywords = [
    ["KW_Case", "case"],
    ["KW_Class", "class"],
    ["KW_Data", "data"],
    ["KW_Default", "default"],
    ["KW_Deriving", "deriving"],
    ["KW_Do", "do"],
    ["KW_Else", "else"],
    ["KW_Foreign", "foreign"],
    ["KW_If", "if"],
    ["KW_Import", "import"],
    // `infixl` / `infixr` before `infix`, and all of them before `in`.
    ["KW_InfixL", "infixl"],
    ["KW_InfixR", "infixr"],
    ["KW_Infix", "infix"],
    ["KW_Instance", "instance"],
    ["KW_In", "in"],
    ["KW_Let", "let"],
    ["KW_Module", "module"],
    ["KW_NewType", "newtype"],
    ["KW_Of", "of"],
    ["KW_Then", "then"],
    ["KW_Type", "type"],
    ["KW_Where", "where"],
    ["KW_As", "as"],
    ["KW_Export", "export"],
    ["KW_Hiding", "hiding"],
    ["KW_Qualified", "qualified"],
    ["KW_Safe", "safe"],
    ["KW_Unsafe", "unsafe"],
  ].map(keyword);

  // A lone `_` is the wildcard; `_foo` is an ordinary identifier.
  const Underscore = createToken({
    name: "Underscore",
    pattern: "_",
    longer_alt: VarId,
  });

  // ---------------------------------------------------------------------
  // Indentation handling.
  // ---------------------------------------------------------------------
  // Stack of currently open indentation widths; column 0 is always open.
  const indentStack = [0];

  /**
   * Matches a run of space characters starting at `startOffset`.
   * Returns a one-element array holding the run (the shape of a RegExp
   * match), or null when there is no space at that offset.
   */
  function matchWhiteSpace(text, startOffset) {
    let end = startOffset;
    while (text[end] === " ") {
      end++;
    }
    return end === startOffset ? null : [text.slice(startOffset, end)];
  }

  // Outdent tokens that the lexer cannot create itself are pushed manually;
  // they carry no position information.
  const syntheticOutdent = () =>
    createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN);

  /**
   * Builds the custom pattern for the Indent ("indent") or Outdent
   * ("outdent") token. The returned matcher takes the full text, the current
   * offset and the vector of tokens matched so far, and returns a match array
   * or null.
   */
  function makeIndentMatcher(type) {
    return function matchIndentation(text, offset, matchedTokens) {
      // Indentation only exists at the very first column of a line.
      const previousChar = text[offset - 1];
      const isStartOfLine =
        offset === 0 || previousChar === "\n" || previousChar === "\r";
      if (!isStartOfLine) {
        return null;
      }

      const match = matchWhiteSpace(text, offset);
      const currIndentLevel = match === null ? 0 : match[0].length;

      // Blank or whitespace-only lines do not change the layout.
      const firstCharOfLine = text[offset + currIndentLevel];
      if (
        firstCharOfLine === undefined ||
        firstCharOfLine === "\n" ||
        firstCharOfLine === "\r"
      ) {
        return null;
      }

      const lastIndentLevel = indentStack[indentStack.length - 1];

      if (type === "indent" && currIndentLevel > lastIndentLevel) {
        indentStack.push(currIndentLevel);
        return match;
      }

      if (type === "outdent" && currIndentLevel < lastIndentLevel) {
        // Closing several levels at once: emit all but the last Outdent here.
        while (
          indentStack.length > 2 &&
          indentStack[indentStack.length - 2] > currIndentLevel
        ) {
          indentStack.pop();
          matchedTokens.push(syntheticOutdent());
        }
        indentStack.pop();
        if (match === null) {
          // Dedent to column 0: there is no text to consume, so returning a
          // match is not possible. Emit the final Outdent manually as well.
          matchedTokens.push(syntheticOutdent());
        }
        return match;
      }

      // Same indentation: leave any spaces to the skipped WS token.
      return null;
    };
  }

  const Newline = createToken({
    name: "Newline",
    pattern: /\n|\r\n?/,
    group: "nl",
    line_breaks: true,
  });
  // Custom token patterns should explicitly specify the line_breaks option.
  const Indent = createToken({
    name: "Indent",
    pattern: makeIndentMatcher("indent"),
    line_breaks: false,
  });
  const Outdent = createToken({
    name: "Outdent",
    pattern: makeIndentMatcher("outdent"),
    line_breaks: false,
  });

  // ---------------------------------------------------------------------
  // Lexer definition. Order matters: the first matching token type wins.
  // ---------------------------------------------------------------------
  const allTokens = [
    Newline,
    Indent,
    Outdent,
    WS,
    StringTok,
    // FloatTok before IntTok so `1.5` is not split at the dot.
    FloatTok,
    IntTok,
    // Qualified names before ConId, which matches their first segment.
    // QConId last of the three because it matches a prefix of the others.
    QVarId,
    QVarSym,
    QConId,
    ...reservedOps,
    ...punctuationTokens,
    ...keywords,
    Underscore,
    VarId,
    ConId,
    VarSym,
  ];

  const hsLexer = new Lexer(allTokens);
  const source = document.getElementById("in").value;
  const lexResult = hsLexer.tokenize(source);

  // Close every indentation level that is still open at end of input.
  while (indentStack.length > 1) {
    indentStack.pop();
    lexResult.tokens.push(syntheticOutdent());
  }

  console.log(lexResult);
}
</script> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment