nixorn · March 7, 2019 14:06
diff --git a/index.html b/index.html
 <html>
    <head>
        <meta content="text/html; charset=utf-8" http-equiv="content-type">
    </head>
    <body>
        <div>
            Input:
            <textarea id="in" name="input" cols="40" rows="3"></textarea>
            <button onclick="lex()"> Parse </button>
        </div>
        <script type="text/javascript" src="https://unpkg.com/[email protected]/lib/chevrotain.min.js"> </script>
        <script src="https://cdn.jsdelivr.net/npm/[email protected]/lodash.min.js"></script>
        <script type="text/javascript">
         function lex() {
             // Lexer-building helpers taken from the global `chevrotain` object.
             const { createToken, createTokenInstance, Lexer } = chevrotain;

             // Token type definitions. Declaration order here is irrelevant; the order
             // that decides which pattern wins is the order of the array handed to
             // `new Lexer(...)`.

             // Runs of spaces are matched but kept out of the token stream.
             const WS = createToken({ name: 'WS', group: Lexer.SKIPPED, pattern: / +/ })

             // Identifiers. The tail uses `\w*` (not `\w+`) so that one-character
             // names such as `x` or `T` are accepted.
             const VarId = createToken({ name: 'VarId', pattern: /[a-z_]\w*/ })
             const QVarId = createToken({ name: 'QVarId', pattern: /\w+\.[a-z_]\w*/ })
             // Was /\[A-Z]w+/, which matched the literal text "[A-Z]www...".
             const ConId = createToken({ name: 'ConId', pattern: /[A-Z]\w*/ })
             // Was registered under the name 'ConId', duplicating the token above.
             const QConId = createToken({ name: 'QConId', pattern: /\w+\.[A-Z]\w*/ })

             // Operator symbols, plain and prefixed with `<word>.`.
             const VarSym = createToken({ name: 'VarSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/ })
             //const ConSym = createToken({ name: 'ConSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/})
             const QVarSym = createToken({ name: 'QVarSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/ })
             //const QConSym = createToken({ name: 'QConSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/})

             // Literals.
             const IntTok = createToken({ name: 'IntTok', pattern: /\d+/ })
             const FloatTok = createToken({ name: 'FloatTok', pattern: /\d+\.\d+/ })
             // NOTE(review): this binding shadows the built-in `String` inside lex(),
             // and the pattern is a run of word characters, commas and spaces rather
             // than a quoted literal. Left as is because other code refers to it by name.
             const String = createToken({ name: 'String', pattern: /[\w, ]+/ })

             // Brackets and separators.
             const LeftParen = createToken({ name: 'LeftParen', pattern: '(' })
             const RightParen = createToken({ name: 'RightParen', pattern: ')' })
             const SemiColon = createToken({ name: 'SemiColon', pattern: ';' })
             const LeftCurly = createToken({ name: 'LeftCurly', pattern: '{' })
             const RightCurly = createToken({ name: 'RightCurly', pattern: '}' })
             const LeftSquare = createToken({ name: 'LeftSquare', pattern: '[' })
             const RightSquare = createToken({ name: 'RightSquare', pattern: ']' })
             const Comma = createToken({ name: 'Comma', pattern: ',' })
             const Underscore = createToken({ name: 'Underscore', pattern: '_' })
             const BackQuote = createToken({ name: 'BackQuote', pattern: '`' })

             // Reserved operators. String patterns are matched literally.
             // '..' is the same string value the original spelled as '\.\.'.
             const DotDot = createToken({ name: 'DotDot', pattern: '..' })
             const Colon = createToken({ name: 'Colon', pattern: ':' })
             const DoubleColon = createToken({ name: 'DoubleColon', pattern: '::' })
             const Equals = createToken({ name: 'Equals', pattern: '=' })
             const Backslash = createToken({ name: 'Backslash', pattern: '\\' })
             const Bar = createToken({ name: 'Bar', pattern: '|' })
             const LeftArrow = createToken({ name: 'LeftArrow', pattern: '<-' })
             const RightArrow = createToken({ name: 'RightArrow', pattern: '->' })
             const At = createToken({ name: 'At', pattern: '@' })
             const Tilde = createToken({ name: 'Tilde', pattern: '~' })
             const DoubleArrow = createToken({ name: 'DoubleArrow', pattern: '=>' })
             const Minus = createToken({ name: 'Minus', pattern: '-' })
             const Exclamation = createToken({ name: 'Exclamation', pattern: '!' })

             // Keywords.
             const KW_Case = createToken({ name: 'KW_Case', pattern: 'case' })
             const KW_Class = createToken({ name: 'KW_Class', pattern: 'class' })
             const KW_Data = createToken({ name: 'KW_Data', pattern: 'data' })
             const KW_Default = createToken({ name: 'KW_Default', pattern: 'default' })
             const KW_Deriving = createToken({ name: 'KW_Deriving', pattern: 'deriving' })
             const KW_Do = createToken({ name: 'KW_Do', pattern: 'do' })
             const KW_Else = createToken({ name: 'KW_Else', pattern: 'else' })
             const KW_Foreign = createToken({ name: 'KW_Foreign', pattern: 'foreign' })
             const KW_If = createToken({ name: 'KW_If', pattern: 'if' })
             const KW_Import = createToken({ name: 'KW_Import', pattern: 'import' })
             const KW_In = createToken({ name: 'KW_In', pattern: 'in' })
             const KW_Infix = createToken({ name: 'KW_Infix', pattern: 'infix' })
             const KW_InfixL = createToken({ name: 'KW_InfixL', pattern: 'infixl' })
             const KW_InfixR = createToken({ name: 'KW_InfixR', pattern: 'infixr' })
             const KW_Instance = createToken({ name: 'KW_Instance', pattern: 'instance' })
             const KW_Let = createToken({ name: 'KW_Let', pattern: 'let' })
             const KW_Module = createToken({ name: 'KW_Module', pattern: 'module' })
             const KW_NewType = createToken({ name: 'KW_NewType', pattern: 'newtype' })
             const KW_Of = createToken({ name: 'KW_Of', pattern: 'of' })
             const KW_Then = createToken({ name: 'KW_Then', pattern: 'then' })
             const KW_Type = createToken({ name: 'KW_Type', pattern: 'type' })
             const KW_Where = createToken({ name: 'KW_Where', pattern: 'where' })
             const KW_As = createToken({ name: 'KW_As', pattern: 'as' })
             const KW_Export = createToken({ name: 'KW_Export', pattern: 'export' })
             const KW_Hiding = createToken({ name: 'KW_Hiding', pattern: 'hiding' })
             const KW_Qualified = createToken({ name: 'KW_Qualified', pattern: 'qualified' })
             const KW_Safe = createToken({ name: 'KW_Safe', pattern: 'safe' })
             // The closing parenthesis of this call was missing, which made the
             // whole script a syntax error.
             const KW_Unsafe = createToken({ name: 'KW_Unsafe', pattern: 'unsafe' })

             // State shared by the indentation matchers. Both are declared inside
             // lex(), so every click on "Parse" starts from a fresh state.
             // indentStack: indentation widths currently open; 0 is the base level.
             let indentStack = [0]
             // lastOffsetChecked: offset at which a zero-width indentation match was
             // last attempted.
             let lastOffsetChecked

             /**
              * Collects the run of space characters that begins at `startOffset`.
              *
              * @param {string} text - the text being scanned
              * @param {number} startOffset - index at which the run must start
              * @returns {string[]|null} a one-element array holding the matched
              *   spaces, or null when the character at `startOffset` is not a space
              */
             function matchWhiteSpace(text, startOffset) {
                 let end = startOffset
                 while (text[end] === " ") {
                     end++
                 }

                 // The scan position never moved: there is no space to report.
                 if (end === startOffset) {
                     return null
                 }

                 return [" ".repeat(end - startOffset)]
             }
             /**
              * Shared custom-pattern matcher behind the Indent and Outdent tokens.
              *
              * It only matches on the first line or directly after a newline, and
              * compares the width of the leading run of spaces with the top of
              * `indentStack`.
              *
              * Side effects: pushes to / pops from `indentStack`, may update
              * `lastOffsetChecked`, and when one line closes several levels it
              * pushes the extra Outdent tokens straight into `matchedTokens`.
              *
              * @param {string} text - full input being tokenized
              * @param {number} offset - position at which a match is attempted
              * @param {Array} matchedTokens - tokens produced so far
              * @param {Object} groups - token groups; `groups.nl` holds the newlines
              * @param {string} type - "indent" or "outdent"
              * @returns {string[]|null} the match array, or null when no
              *   indentation change of the requested `type` starts at `offset`
              */
             function matchIndentBase(text, offset, matchedTokens, groups, type) {
                 const noTokensMatchedYet = _.isEmpty(matchedTokens)
                 const newLines = groups.nl
                 const noNewLinesMatchedYet = _.isEmpty(newLines)
                 const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet
                 const isStartOfLine =
                     // only newlines matched so far
                     (noTokensMatchedYet && !noNewLinesMatchedYet) ||
                     // newlines and other tokens matched, and the most recent match is a newline
                     (!noTokensMatchedYet &&
                         !noNewLinesMatchedYet &&
                         _.last(newLines).startOffset > _.last(matchedTokens).startOffset)

                 // Indentation can only be matched at the start of a line.
                 if (!isFirstLine && !isStartOfLine) {
                     return null
                 }

                 let match
                 let currIndentLevel
                 // NOTE(review): `text.length < offset` only holds once offset is past
                 // the end of the input. This may have been meant as
                 // `offset < text.length`, but confirm before changing it:
                 // `lastOffsetChecked` is shared by the indent and the outdent matcher.
                 const isZeroIndent = text.length < offset && text[offset] !== " "
                 if (isZeroIndent) {
                     // A zero-width match consumes no characters, so matching it twice at
                     // the same offset would loop forever; remember where it was tried.
                     if (lastOffsetChecked !== offset) {
                         currIndentLevel = 0
                         match = [""]
                         lastOffsetChecked = offset
                     }
                 } else {
                     // possible non-empty indentation
                     match = matchWhiteSpace(text, offset)
                     if (match !== null) {
                         currIndentLevel = match[0].length
                     }
                 }

                 // No indentation could be measured at this offset.
                 if (currIndentLevel === undefined) {
                     return null
                 }

                 const lastIndentLevel = _.last(indentStack)

                 if (type === "indent" && currIndentLevel > lastIndentLevel) {
                     indentStack.push(currIndentLevel)
                     return match
                 }

                 if (type === "outdent" && currIndentLevel < lastIndentLevel) {
                     // When more than one level closes at once, add every Outdent but the
                     // last one by hand; the last one is the token returned below.
                     while (
                         indentStack.length > 2 &&
                         indentStack[indentStack.length - 2] > currIndentLevel
                     ) {
                         indentStack.pop()
                         // Empty image; the six NaN arguments are presumably the
                         // offset/line/column fields, left unset.
                         matchedTokens.push(
                             createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN)
                         )
                     }
                     indentStack.pop()
                     return match
                 }

                 // Same level, or a change in the direction this matcher does not
                 // handle: let the spaces be lexed as ordinary whitespace.
                 return null
             }

             // Bind the shared matcher to each direction. partialRight appends the
             // string after the arguments supplied by the caller, so it arrives as
             // the trailing `type` parameter of matchIndentBase.
             const matchIndent = _.partialRight(matchIndentBase, "indent")
             const matchOutdent = _.partialRight(matchIndentBase, "outdent")

             // Newlines go to their own "nl" group, which matchIndentBase reads
             // through `groups.nl`.
             const Newline = createToken({
                 name: "Newline",
                 pattern: /\n|\r\n?/,
                 group: "nl"
             })

             // define the indentation tokens using custom token patterns
             const Indent = createToken({
                 name: "Indent",
                 pattern: matchIndent,
                 // custom token patterns should explicitly specify the line_breaks option
                 line_breaks: false
             })
             const Outdent = createToken({
                 name: "Outdent",
                 pattern: matchOutdent,
                 // custom token patterns should explicitly specify the line_breaks option
                 line_breaks: false
             })

             // The lexer takes the FIRST token type in this list that matches, so a
             // token whose text begins with another token's text must come first
             // ("::" before ":", "=>" before "=", "infixl" before "infix" before
             // "in", float before int). The previous order shadowed those longer
             // tokens.
             // NOTE(review): keywords still win over identifiers that merely start
             // with a keyword (e.g. "classes"), and the single-character operators
             // still win over longer VarSym runs; neither is addressed here.
             const allTokens = [
                 Newline, Indent, Outdent,
                 WS,
                 DotDot,
                 DoubleColon,
                 Colon,
                 DoubleArrow,
                 Equals,
                 Backslash,
                 Bar,
                 LeftArrow,
                 RightArrow,
                 At,
                 Tilde,
                 Minus,
                 Exclamation,
                 Comma,
                 Underscore,
                 KW_Case,
                 KW_Class,
                 KW_Data,
                 KW_Default,
                 KW_Deriving,
                 KW_Do,
                 KW_Else,
                 KW_Foreign,
                 KW_If,
                 KW_Import,
                 KW_InfixL,
                 KW_InfixR,
                 KW_Infix,
                 KW_Instance,
                 KW_In,
                 KW_Let,
                 KW_Module,
                 KW_NewType,
                 KW_Of,
                 KW_Then,
                 KW_Type,
                 KW_Where,
                 KW_As,
                 KW_Export,
                 KW_Hiding,
                 KW_Qualified,
                 KW_Safe,
                 KW_Unsafe,
                 VarId,
                 // Qualified forms precede ConId so that the prefix before the dot
                 // is not consumed on its own first.
                 QVarId,
                 QConId,
                 QVarSym,
                 ConId,
                 VarSym,
                 FloatTok,
                 IntTok,
                 String,
                 LeftParen,
                 RightParen,
                 SemiColon,
                 LeftCurly,
                 RightCurly,
                 LeftSquare,
                 RightSquare,
                 BackQuote
             ]

             const HSLexer = new Lexer(allTokens)

             // Tokenize the textarea contents and log the raw result.
             const source = document.getElementById('in').value;
             console.log(HSLexer.tokenize(source))

         }
        </script>
    </body>
 </html>
	<html>
	<head>
	<meta content="text/html; charset=utf-8" http-equiv="content-type">
	</head>
	<body>
	<div>
	Input:
	<textarea id="in" name="input" cols="40" rows="3"></textarea>
	<button onclick="lex()"> Parse </button>
	</div>
	<script type="text/javascript" src="https://unpkg.com/[email protected]/lib/chevrotain.min.js"> </script>
	<script src="https://cdn.jsdelivr.net/npm/[email protected]/lodash.min.js"></script>
	<script type="text/javascript">
	function lex() {
	// Lexer-building helpers taken from the global `chevrotain` object.
	const { createToken, createTokenInstance, Lexer } = chevrotain;

	// Token type definitions. Declaration order here is irrelevant; the order
	// that decides which pattern wins is the order of the array handed to
	// `new Lexer(...)`.

	// Runs of spaces are matched but kept out of the token stream.
	const WS = createToken({ name: 'WS', group: Lexer.SKIPPED, pattern: / +/ })

	// Identifiers. The tail uses `\w*` (not `\w+`) so that one-character
	// names such as `x` or `T` are accepted.
	const VarId = createToken({ name: 'VarId', pattern: /[a-z_]\w*/ })
	const QVarId = createToken({ name: 'QVarId', pattern: /\w+\.[a-z_]\w*/ })
	// Was /\[A-Z]w+/, which matched the literal text "[A-Z]www...".
	const ConId = createToken({ name: 'ConId', pattern: /[A-Z]\w*/ })
	// Was registered under the name 'ConId', duplicating the token above.
	const QConId = createToken({ name: 'QConId', pattern: /\w+\.[A-Z]\w*/ })

	// Operator symbols, plain and prefixed with `<word>.`. The stray backslash
	// in front of each pipe character has been removed.
	const VarSym = createToken({ name: 'VarSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/ })
	//const ConSym = createToken({ name: 'ConSym', pattern: /[:!#%&*.\/?@$+<=>^|~\\-]+/})
	const QVarSym = createToken({ name: 'QVarSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/ })
	//const QConSym = createToken({ name: 'QConSym', pattern: /\w+\.[:!#%&*.\/?@$+<=>^|~\\-]+/})

	// Literals.
	const IntTok = createToken({ name: 'IntTok', pattern: /\d+/ })
	const FloatTok = createToken({ name: 'FloatTok', pattern: /\d+\.\d+/ })
	// NOTE(review): this binding shadows the built-in `String` inside lex(),
	// and the pattern is a run of word characters, commas and spaces rather
	// than a quoted literal. Left as is because other code refers to it by name.
	const String = createToken({ name: 'String', pattern: /[\w, ]+/ })

	// Brackets and separators.
	const LeftParen = createToken({ name: 'LeftParen', pattern: '(' })
	const RightParen = createToken({ name: 'RightParen', pattern: ')' })
	const SemiColon = createToken({ name: 'SemiColon', pattern: ';' })
	const LeftCurly = createToken({ name: 'LeftCurly', pattern: '{' })
	const RightCurly = createToken({ name: 'RightCurly', pattern: '}' })
	const LeftSquare = createToken({ name: 'LeftSquare', pattern: '[' })
	const RightSquare = createToken({ name: 'RightSquare', pattern: ']' })
	const Comma = createToken({ name: 'Comma', pattern: ',' })
	const Underscore = createToken({ name: 'Underscore', pattern: '_' })
	const BackQuote = createToken({ name: 'BackQuote', pattern: '`' })

	// Reserved operators. String patterns are matched literally.
	// '..' and '|' are the same string values the original spelled with
	// redundant backslashes.
	const DotDot = createToken({ name: 'DotDot', pattern: '..' })
	const Colon = createToken({ name: 'Colon', pattern: ':' })
	const DoubleColon = createToken({ name: 'DoubleColon', pattern: '::' })
	const Equals = createToken({ name: 'Equals', pattern: '=' })
	const Backslash = createToken({ name: 'Backslash', pattern: '\\' })
	const Bar = createToken({ name: 'Bar', pattern: '|' })
	const LeftArrow = createToken({ name: 'LeftArrow', pattern: '<-' })
	const RightArrow = createToken({ name: 'RightArrow', pattern: '->' })
	const At = createToken({ name: 'At', pattern: '@' })
	const Tilde = createToken({ name: 'Tilde', pattern: '~' })
	const DoubleArrow = createToken({ name: 'DoubleArrow', pattern: '=>' })
	const Minus = createToken({ name: 'Minus', pattern: '-' })
	const Exclamation = createToken({ name: 'Exclamation', pattern: '!' })

	// Keywords.
	const KW_Case = createToken({ name: 'KW_Case', pattern: 'case' })
	const KW_Class = createToken({ name: 'KW_Class', pattern: 'class' })
	const KW_Data = createToken({ name: 'KW_Data', pattern: 'data' })
	const KW_Default = createToken({ name: 'KW_Default', pattern: 'default' })
	const KW_Deriving = createToken({ name: 'KW_Deriving', pattern: 'deriving' })
	const KW_Do = createToken({ name: 'KW_Do', pattern: 'do' })
	const KW_Else = createToken({ name: 'KW_Else', pattern: 'else' })
	const KW_Foreign = createToken({ name: 'KW_Foreign', pattern: 'foreign' })
	const KW_If = createToken({ name: 'KW_If', pattern: 'if' })
	const KW_Import = createToken({ name: 'KW_Import', pattern: 'import' })
	const KW_In = createToken({ name: 'KW_In', pattern: 'in' })
	const KW_Infix = createToken({ name: 'KW_Infix', pattern: 'infix' })
	const KW_InfixL = createToken({ name: 'KW_InfixL', pattern: 'infixl' })
	const KW_InfixR = createToken({ name: 'KW_InfixR', pattern: 'infixr' })
	const KW_Instance = createToken({ name: 'KW_Instance', pattern: 'instance' })
	const KW_Let = createToken({ name: 'KW_Let', pattern: 'let' })
	const KW_Module = createToken({ name: 'KW_Module', pattern: 'module' })
	const KW_NewType = createToken({ name: 'KW_NewType', pattern: 'newtype' })
	const KW_Of = createToken({ name: 'KW_Of', pattern: 'of' })
	const KW_Then = createToken({ name: 'KW_Then', pattern: 'then' })
	const KW_Type = createToken({ name: 'KW_Type', pattern: 'type' })
	const KW_Where = createToken({ name: 'KW_Where', pattern: 'where' })
	const KW_As = createToken({ name: 'KW_As', pattern: 'as' })
	const KW_Export = createToken({ name: 'KW_Export', pattern: 'export' })
	const KW_Hiding = createToken({ name: 'KW_Hiding', pattern: 'hiding' })
	const KW_Qualified = createToken({ name: 'KW_Qualified', pattern: 'qualified' })
	const KW_Safe = createToken({ name: 'KW_Safe', pattern: 'safe' })
	// The closing parenthesis of this call was missing, which made the
	// whole script a syntax error.
	const KW_Unsafe = createToken({ name: 'KW_Unsafe', pattern: 'unsafe' })

	// State shared by the indentation matchers. Both are declared inside
	// lex(), so every click on "Parse" starts from a fresh state.
	// indentStack: indentation widths currently open; 0 is the base level.
	let indentStack = [0]
	// lastOffsetChecked: offset at which a zero-width indentation match was
	// last attempted.
	let lastOffsetChecked

	/**
	 * Collects the run of space characters that begins at `startOffset`.
	 *
	 * @param {string} text - the text being scanned
	 * @param {number} startOffset - index at which the run must start
	 * @returns {string[]|null} a one-element array holding the matched
	 *   spaces, or null when the character at `startOffset` is not a space
	 */
	function matchWhiteSpace(text, startOffset) {
		let end = startOffset
		while (text[end] === " ") {
			end++
		}

		// The scan position never moved: there is no space to report.
		if (end === startOffset) {
			return null
		}

		return [" ".repeat(end - startOffset)]
	}
	/**
	 * Shared custom-pattern matcher behind the Indent and Outdent tokens.
	 *
	 * It only matches on the first line or directly after a newline, and
	 * compares the width of the leading run of spaces with the top of
	 * `indentStack`.
	 *
	 * Side effects: pushes to / pops from `indentStack`, may update
	 * `lastOffsetChecked`, and when one line closes several levels it
	 * pushes the extra Outdent tokens straight into `matchedTokens`.
	 *
	 * @param {string} text - full input being tokenized
	 * @param {number} offset - position at which a match is attempted
	 * @param {Array} matchedTokens - tokens produced so far
	 * @param {Object} groups - token groups; `groups.nl` holds the newlines
	 * @param {string} type - "indent" or "outdent"
	 * @returns {string[]|null} the match array, or null when no
	 *   indentation change of the requested `type` starts at `offset`
	 */
	function matchIndentBase(text, offset, matchedTokens, groups, type) {
		const noTokensMatchedYet = _.isEmpty(matchedTokens)
		const newLines = groups.nl
		const noNewLinesMatchedYet = _.isEmpty(newLines)
		const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet
		// The logical-or operators here had been corrupted into `\|\|`, which
		// does not parse.
		const isStartOfLine =
			// only newlines matched so far
			(noTokensMatchedYet && !noNewLinesMatchedYet) ||
			// newlines and other tokens matched, and the most recent match is a newline
			(!noTokensMatchedYet &&
				!noNewLinesMatchedYet &&
				_.last(newLines).startOffset > _.last(matchedTokens).startOffset)

		// Indentation can only be matched at the start of a line.
		if (!isFirstLine && !isStartOfLine) {
			return null
		}

		let match
		let currIndentLevel
		// NOTE(review): `text.length < offset` only holds once offset is past
		// the end of the input. This may have been meant as
		// `offset < text.length`, but confirm before changing it:
		// `lastOffsetChecked` is shared by the indent and the outdent matcher.
		const isZeroIndent = text.length < offset && text[offset] !== " "
		if (isZeroIndent) {
			// A zero-width match consumes no characters, so matching it twice at
			// the same offset would loop forever; remember where it was tried.
			if (lastOffsetChecked !== offset) {
				currIndentLevel = 0
				match = [""]
				lastOffsetChecked = offset
			}
		} else {
			// possible non-empty indentation
			match = matchWhiteSpace(text, offset)
			if (match !== null) {
				currIndentLevel = match[0].length
			}
		}

		// No indentation could be measured at this offset.
		if (currIndentLevel === undefined) {
			return null
		}

		const lastIndentLevel = _.last(indentStack)

		if (type === "indent" && currIndentLevel > lastIndentLevel) {
			indentStack.push(currIndentLevel)
			return match
		}

		if (type === "outdent" && currIndentLevel < lastIndentLevel) {
			// When more than one level closes at once, add every Outdent but the
			// last one by hand; the last one is the token returned below.
			while (
				indentStack.length > 2 &&
				indentStack[indentStack.length - 2] > currIndentLevel
			) {
				indentStack.pop()
				// Empty image; the six NaN arguments are presumably the
				// offset/line/column fields, left unset.
				matchedTokens.push(
					createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN)
				)
			}
			indentStack.pop()
			return match
		}

		// Same level, or a change in the direction this matcher does not
		// handle: let the spaces be lexed as ordinary whitespace.
		return null
	}

	// Bind the shared matcher to each direction. partialRight appends the
	// string after the arguments supplied by the caller, so it arrives as
	// the trailing `type` parameter of matchIndentBase.
	const matchIndent = _.partialRight(matchIndentBase, "indent")
	const matchOutdent = _.partialRight(matchIndentBase, "outdent")

	// Newlines go to their own "nl" group, which matchIndentBase reads
	// through `groups.nl`. The alternation bar had been escaped (`\|`),
	// which turned it into a literal pipe character.
	const Newline = createToken({
		name: "Newline",
		pattern: /\n|\r\n?/,
		group: "nl"
	})

	// define the indentation tokens using custom token patterns
	const Indent = createToken({
		name: "Indent",
		pattern: matchIndent,
		// custom token patterns should explicitly specify the line_breaks option
		line_breaks: false
	})
	const Outdent = createToken({
		name: "Outdent",
		pattern: matchOutdent,
		// custom token patterns should explicitly specify the line_breaks option
		line_breaks: false
	})

	// The lexer takes the FIRST token type in this list that matches, so a
	// token whose text begins with another token's text must come first
	// ("::" before ":", "=>" before "=", "infixl" before "infix" before
	// "in", float before int). The previous order shadowed those longer
	// tokens.
	// NOTE(review): keywords still win over identifiers that merely start
	// with a keyword (e.g. "classes"), and the single-character operators
	// still win over longer VarSym runs; neither is addressed here.
	const allTokens = [
		Newline, Indent, Outdent,
		WS,
		DotDot,
		DoubleColon,
		Colon,
		DoubleArrow,
		Equals,
		Backslash,
		Bar,
		LeftArrow,
		RightArrow,
		At,
		Tilde,
		Minus,
		Exclamation,
		Comma,
		Underscore,
		KW_Case,
		KW_Class,
		KW_Data,
		KW_Default,
		KW_Deriving,
		KW_Do,
		KW_Else,
		KW_Foreign,
		KW_If,
		KW_Import,
		KW_InfixL,
		KW_InfixR,
		KW_Infix,
		KW_Instance,
		KW_In,
		KW_Let,
		KW_Module,
		KW_NewType,
		KW_Of,
		KW_Then,
		KW_Type,
		KW_Where,
		KW_As,
		KW_Export,
		KW_Hiding,
		KW_Qualified,
		KW_Safe,
		KW_Unsafe,
		VarId,
		// Qualified forms precede ConId so that the prefix before the dot
		// is not consumed on its own first.
		QVarId,
		QConId,
		QVarSym,
		ConId,
		VarSym,
		FloatTok,
		IntTok,
		String,
		LeftParen,
		RightParen,
		SemiColon,
		LeftCurly,
		RightCurly,
		LeftSquare,
		RightSquare,
		BackQuote
	]

	const HSLexer = new Lexer(allTokens)

	// Tokenize the textarea contents and log the raw result.
	const source = document.getElementById('in').value;
	console.log(HSLexer.tokenize(source))

	}
	</script>
	</body>
	</html>