Sample of a Chevrotain-based Haskell lexer with indent support (poorly tested)
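To try it out, save the HTML below to a file and open it in a browser, paste some Haskell source into the text area, and press Parse; the resulting token stream (including the synthetic Indent/Outdent tokens) is printed to the browser's developer console.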
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type">
</head>
<body>
<div>
Input:
<textarea id="in" name="input" cols="40" rows="3"></textarea>
<button onclick="lex()"> Parse </button>
</div>
<script type="text/javascript" src="https://unpkg.com/chevrotain/lib/chevrotain.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/lodash/lodash.min.js"></script>
<script type="text/javascript">
function lex() {
const { createToken, createTokenInstance, Lexer } = chevrotain;
const WS = createToken({ name: 'WS',group: Lexer.SKIPPED, pattern: / +/})
// \w* (rather than \w+) so single-character names like 'x' also match
const VarId = createToken({ name: 'VarId', pattern: /[a-z_]\w*/})
const QVarId = createToken({ name: 'QVarId', pattern: /\w+\.[a-z_]\w*/})
const ConId = createToken({ name: 'ConId', pattern: /[A-Z]\w*/})
const QConId = createToken({ name: 'QConId', pattern: /\w+\.[A-Z]\w*/})
const VarSym = createToken({ name: 'VarSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/})
//const ConSym = createToken({ name: 'ConSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/})
const QVarSym = createToken({ name: 'QVarSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/})
//const QConSym = createToken({ name: 'QConSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/})
const IntTok = createToken({ name: 'IntTok', pattern: /\d+/})
const FloatTok = createToken({ name: 'FloatTok', pattern: /\d+\.\d+/})
const String = createToken({ name: 'String', pattern: /[\w, ]+/})
const LeftParen = createToken({ name: 'LeftParen', pattern: '('})
const RightParen = createToken({ name: 'RightParen', pattern: ')'})
const SemiColon = createToken({ name: 'SemiColon', pattern: ';'})
const LeftCurly = createToken({ name: 'LeftCurly', pattern: '{'})
const RightCurly = createToken({ name: 'RightCurly', pattern: '}'})
const LeftSquare = createToken({ name: 'LeftSquare', pattern: '['})
const RightSquare = createToken({ name: 'RightSquare', pattern: ']'})
const Comma = createToken({ name: 'Comma', pattern: ','})
const Underscore = createToken({ name: 'Underscore', pattern: '_'})
const BackQuote = createToken({ name: 'BackQuote', pattern: '`'})
const DotDot = createToken({ name: 'DotDot', pattern: '..'})
const Colon = createToken({ name: 'Colon', pattern: ':'})
const DoubleColon = createToken({ name: 'DoubleColon', pattern: '::'})
const Equals = createToken({ name: 'Equals', pattern: '='})
const Backslash = createToken({ name: 'Backslash', pattern: '\\'})
const Bar = createToken({ name: 'Bar', pattern: '|'})
const LeftArrow = createToken({ name: 'LeftArrow', pattern: '<-'})
const RightArrow = createToken({ name: 'RightArrow', pattern: '->'})
const At = createToken({ name: 'At', pattern: '@'})
const Tilde = createToken({ name: 'Tilde', pattern: '~'})
const DoubleArrow = createToken({ name: 'DoubleArrow', pattern: '=>'})
const Minus = createToken({ name: 'Minus', pattern: '-'})
const Exclamation = createToken({ name: 'Exclamation', pattern: '!'})
const KW_Case = createToken({ name: 'KW_Case', pattern: 'case'})
const KW_Class = createToken({ name: 'KW_Class', pattern: 'class'})
const KW_Data = createToken({ name: 'KW_Data', pattern: 'data'})
const KW_Default = createToken({ name: 'KW_Default', pattern: 'default'})
const KW_Deriving = createToken({ name: 'KW_Deriving', pattern: 'deriving'})
const KW_Do = createToken({ name: 'KW_Do', pattern: 'do'})
const KW_Else = createToken({ name: 'KW_Else', pattern: 'else'})
const KW_Foreign = createToken({ name: 'KW_Foreign', pattern: 'foreign'})
const KW_If = createToken({ name: 'KW_If', pattern: 'if'})
const KW_Import = createToken({ name: 'KW_Import', pattern: 'import'})
const KW_In = createToken({ name: 'KW_In', pattern: 'in'})
const KW_Infix = createToken({ name: 'KW_Infix', pattern: 'infix'})
const KW_InfixL = createToken({ name: 'KW_InfixL', pattern: 'infixl'})
const KW_InfixR = createToken({ name: 'KW_InfixR', pattern: 'infixr'})
const KW_Instance = createToken({ name: 'KW_Instance', pattern: 'instance'})
const KW_Let = createToken({ name: 'KW_Let', pattern: 'let'})
const KW_Module = createToken({ name: 'KW_Module', pattern: 'module'})
const KW_NewType = createToken({ name: 'KW_NewType', pattern: 'newtype'})
const KW_Of = createToken({ name: 'KW_Of', pattern: 'of'})
const KW_Then = createToken({ name: 'KW_Then', pattern: 'then'})
const KW_Type = createToken({name: 'KW_Type', pattern: 'type'})
const KW_Where = createToken({name: 'KW_Where', pattern: 'where'})
const KW_As = createToken({name: 'KW_As', pattern: 'as'})
const KW_Export = createToken({name: 'KW_Export', pattern: 'export'})
const KW_Hiding = createToken({name: 'KW_Hiding', pattern: 'hiding'})
const KW_Qualified = createToken({name: 'KW_Qualified', pattern: 'qualified'})
const KW_Safe = createToken({name: 'KW_Safe', pattern: 'safe'})
const KW_Unsafe = createToken({name: 'KW_Unsafe', pattern: 'unsafe'})
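// Caveat: the keywords and reserved operators above are plain string patterns and the
// lexer picks the first token type that matches, so an identifier that merely starts
// with a keyword ("classes", "ifM") lexes as the keyword plus a remainder, and "=="
// lexes as two Equals tokens. Chevrotain's usual remedy (not applied here, to stay
// close to the original) is longer_alt, e.g.:
//   createToken({ name: 'KW_Class', pattern: 'class', longer_alt: VarId })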
let indentStack = [0]
let lastOffsetChecked
function matchWhiteSpace(text, startOffset) {
let result = ""
let offset = startOffset
while (text[offset] === " ") {
offset++
result += " "
}
if (result === "") {
return null
}
return [result]
}
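// Note: this helper is equivalent to matching a sticky whitespace regex at the given
// offset (e.g. / +/y with lastIndex = startOffset and exec); the explicit loop is
// just easier to follow in a small demo.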
function matchIndentBase(text, offset, matchedTokens, groups, type) {
const noTokensMatchedYet = _.isEmpty(matchedTokens)
const newLines = groups.nl
const noNewLinesMatchedYet = _.isEmpty(newLines)
const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet
const isStartOfLine =
// only newlines matched so far
(noTokensMatchedYet && !noNewLinesMatchedYet) ||
// Both newlines and other Tokens have been matched AND the last matched Token is a newline
(!noTokensMatchedYet &&
!noNewLinesMatchedYet &&
_.last(newLines).startOffset > _.last(matchedTokens).startOffset)
// indentation can only be matched at the start of a line.
if (isFirstLine || isStartOfLine) {
let match
let currIndentLevel = undefined
const isZeroIndent = text.length < offset && text[offset] !== " "
if (isZeroIndent) {
// Matching zero spaces Outdent would not consume any chars, thus it would cause an infinite loop.
// This check prevents matching a sequence of zero spaces outdents.
if (lastOffsetChecked !== offset) {
currIndentLevel = 0
match = [""]
lastOffsetChecked = offset
}
} else {
// possible non-empty indentation
match = matchWhiteSpace(text, offset)
if (match !== null) {
currIndentLevel = match[0].length
}
}
if (currIndentLevel !== undefined) {
const lastIndentLevel = _.last(indentStack)
if (currIndentLevel > lastIndentLevel && type === "indent") {
indentStack.push(currIndentLevel)
return match
} else if (
currIndentLevel < lastIndentLevel &&
type === "outdent"
) {
//if we need more than one outdent token, add all but the last one
if (indentStack.length > 2) {
const image = ""
const offset = _.last(matchedTokens).endOffset + 1
const line = _.last(matchedTokens).endLine
const column = _.last(matchedTokens).endColumn + 1
while (
indentStack.length > 2 &&
//stop before the last Outdent
indentStack[indentStack.length - 2] > currIndentLevel
) {
indentStack.pop()
matchedTokens.push(
createTokenInstance(
Outdent,
"",
NaN,
NaN,
NaN,
NaN,
NaN,
NaN
)
)
}
}
indentStack.pop()
return match
} else {
// same indent, this should be lexed as simple whitespace and ignored
return null
}
} else {
// indentation cannot be matched without at least one space character.
return null
}
} else {
// indentation cannot be matched under other circumstances
return null
}
}
const matchIndent = _.partialRight(matchIndentBase, "indent")
const matchOutdent = _.partialRight(matchIndentBase, "outdent")
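// How the indentation tokens are produced: indentStack holds the indentation levels
// seen so far (initially [0]). At the start of a line, matchIndent pushes a new level
// and emits Indent when the line is indented further than the top of the stack;
// matchOutdent pops levels (one Outdent per popped level) when the line is indented
// less. For example, in
//   f = do
//     g
//   h
// the "  g" line is intended to yield an Indent (stack [0, 2]) and the "h" line an
// Outdent (stack back to [0]).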
const Newline = createToken({
name: "Newline",
pattern: /\n|\r\n?/,
group: "nl"
})
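// Newline is routed into the "nl" token group rather than the main token stream;
// matchIndentBase reads it back via groups.nl to detect the start of a line.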
// define the indentation tokens using custom token patterns
const Indent = createToken({
name: "Indent",
pattern: matchIndent,
// custom token patterns should explicitly specify the line_breaks option
line_breaks: false
})
const Outdent = createToken({
name: "Outdent",
pattern: matchOutdent,
// custom token patterns should explicitly specify the line_breaks option
line_breaks: false
})
let allTokens = [
Newline, Indent, Outdent,
WS,
DotDot,
// multi-character symbols must precede their single-character prefixes
// ('::' before ':', '=>' before '='), otherwise the lexer splits them
DoubleColon,
Colon,
DoubleArrow,
Equals,
Backslash,
Bar,
LeftArrow,
RightArrow,
At,
Tilde,
Minus,
Exclamation,
Comma,
Underscore,
KW_Case,
KW_Class,
KW_Data,
KW_Default,
KW_Deriving,
KW_Do,
KW_Else,
KW_Foreign,
KW_If,
KW_Import,
// longer keywords first so that 'infixl' / 'instance' are not split after 'in'
KW_Instance,
KW_InfixL,
KW_InfixR,
KW_Infix,
KW_In,
KW_Let,
KW_Module,
KW_NewType,
KW_Of,
KW_Then,
KW_Type,
KW_Where,
KW_As,
KW_Export,
KW_Hiding,
KW_Qualified,
KW_Safe,
KW_Unsafe,
// qualified names and floats first so 'Data.map' / '1.5' are not split
QVarId,
QConId,
VarId,
ConId,
QVarSym,
VarSym,
FloatTok,
IntTok,
String,
LeftParen,
RightParen,
SemiColon,
LeftCurly,
RightCurly,
LeftSquare,
RightSquare,
BackQuote
]
let HSLexer = new Lexer(allTokens)
let source = document.getElementById('in').value;
const lexResult = HSLexer.tokenize(source)
console.log(lexResult)
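// Extra diagnostics (a small addition, not in the original gist): print the matched
// token type names in order and surface any lexing errors; tokenize() returns an
// object of the shape { tokens, groups, errors }.
console.log(lexResult.tokens.map(t => t.tokenType.name).join(" "))
if (lexResult.errors.length > 0) {
console.warn("Lexing errors:", lexResult.errors)
}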
}
</script>
</body>
</html>