rcampbell · May 5, 2011 10:12
diff --git a/lexer.js b/lexer.js
 //
 // Lexical Analyzer for Scheme R5RS
 // 
 // TODO: numbers, better error reporting
 //
 // Depends on Functional Javascript: http://osteele.com/sources/javascript/functional/
 //

 // Per the header above, this file depends on Functional Javascript. install()
 // is presumably what publishes that library's helpers as globals -- TODO
 // confirm against the library documentation.
 Functional.install();

 //
 // String.prototype.tokenize: lexical analyzer for a subset of R5RS Scheme.
 //
 // Called on a program string. Returns an object with these accessors:
 //   lexemes() - array of token instances in source order. Each instance
 //               inherits `type` from its token definition and carries
 //               `value` (the parsed lexeme), `line`, `begin` and `end`
 //               (character offsets into the input, `end` exclusive).
 //   symbols() - map from identifier text to {type: 'system' | 'user'};
 //               'system' marks the syntactic keywords listed below.
 // addSymbol()/addLexeme() are exposed on the result as well.
 //
 // Spaces, newlines and comments are consumed but produce no lexeme.
 // Throws an Error whose name is 'UnknownToken' when no pattern matches.
 //
 String.prototype.tokenize = (function() {
     var expressionKeywords = ['quote', 'lambda', 'if', 'set!', 'begin', 'cond', 'and', 'or', 'case', 'let', 'let*', 'letrec', 'do', 'delay', 'quasiquote'];
     var syntacticKeywords  = ['else', '=>', 'define', 'unquote', 'unquote-splicing'].concat(expressionKeywords);
     // Plain-function parsers (no string lambdas, so no code is built from text).
     var identity = function(x) { return x; };
     var constant = function(value) { return function() { return value; }; };
     // At least one character is required ("+", not "*"): an empty match would
     // consume nothing and spin the main loop forever, e.g. on an unterminated
     // string. "+", "-" and "." are escaped individually so the class does not
     // form the range "+".."." (which would also admit ",").
     var identifierPattern  = /^((?:\+|-|\.\.\.)|[0-9A-Za-z!\$%&\*\/:<=>\?\^_~\+\-\.@]+)[ \n\(\)";]/;
     // Tried in order; the first pattern that matches wins. Only capture group 1
     // is consumed, so a trailing delimiter stays in the input for the next round.
     // Entries without `parse` are skipped silently (no lexeme is emitted).
     var tokens = [{pattern:/^( )+/,                     type:'space'}
                  ,{pattern:/^(\n)+/,                    type:'newline'}
                  ,{pattern:/^(;.*)/,                    type:'comment'}
                  ,{pattern:/^(\()/,                     type:'lparen',     parse:identity}
                  ,{pattern:/^(\))/,                     type:'rparen',     parse:identity}
                  ,{pattern:/^([0-9]+)[ \n\(\)";]/,      type:'number',     parse:function(x){ return parseInt(x, 10); }}
                  ,{pattern:/^(#t)/,                     type:'boolean',    parse:constant(true)}
                  ,{pattern:/^(#f)/,                     type:'boolean',    parse:constant(false)}
                  ,{pattern:/^(#\\space)[ \n\(\)";]/,    type:'character',  parse:constant(' ')}
                  ,{pattern:/^(#\\newline)[ \n\(\)";]/,  type:'character',  parse:constant('\n')}
                  ,{pattern:/^(#\\.)[ \n\(\)";]/,        type:'character',  parse:function(x){ return x.substring(2); }}
                  ,{pattern:/^("(?:[^"\\]|\\"|\\\\)*")/, type:'string',     parse:function(x){ return x.substring(1, x.length - 1); }}
                  ,{pattern:identifierPattern,           type:'identifier', parse:identity, identify:true}
                  ];
     // Returns up to MAX leading non-whitespace characters of `program` for use
     // in an error message, or '' when there are none.
     var getErrorSnippet = function(program) {
         var MAX = 10;
         var snippet = /\S+/.exec(program);
         return (snippet && snippet[0] && snippet[0].substring(0, MAX)) || '';
     };
     // Builds a lexeme object that inherits from its token definition.
     var makeLexeme = function(token, lexeme, line, begin, end) {
         var instance = Object.create(token);
         instance.value = token.parse(lexeme);
         instance.line = line;
         instance.begin = begin;
         instance.end = end;
         return instance;
     };
     return function() {
         var program = this ? this + ' ' : false; // EOF delimiter for implicit termination
         var output = (function() {
             var symbols = {};
             var lexemes = [];
             return {
                 addSymbol: function(lexeme) {
                     // Own-property test: a truthiness test would see inherited
                     // members such as "constructor" and never record them.
                     if (!Object.prototype.hasOwnProperty.call(symbols, lexeme)) {
                         symbols[lexeme] = {
                             type: syntacticKeywords.indexOf(lexeme) !== -1 ? 'system' : 'user'
                         };
                     }
                 },
                 symbols: function() {
                     return symbols;
                 },
                 addLexeme: function(lexeme) {
                     lexemes.push(lexeme);
                 },
                 lexemes: function() {
                     return lexemes;
                 }
             };
         }());
         var begin = 0, end = 0, line = 1, character = 0;
         var i, token, match, lexeme, error;
         while (program) {
             match = null;
             for (i = 0; i < tokens.length; i += 1) {
                 token = tokens[i];
                 match = token.pattern.exec(program);
                 if (match) break;
             }
             if (!match) {
                 error = new Error('Unknown token starting on line ' + line + ', character ' + character + (getErrorSnippet(program) ? (': ' + getErrorSnippet(program)) : '.'));
                 error.name = 'UnknownToken';
                 throw error;
             }
             lexeme = match[1];
             end = begin + lexeme.length;
             if (token.parse) {
                 output.addLexeme(makeLexeme(token, lexeme, line, begin, end));
             }
             if (token.identify) {
                 output.addSymbol(lexeme);
             }
             begin = end;
             if (lexeme === '\n') {
                 // The newline itself is not a column on the next line, so the
                 // first character of every line is reported as character 0.
                 line += 1;
                 character = 0;
             } else {
                 character += lexeme.length;
             }
             program = program.substring(lexeme.length);
         }
         // Debug trace of the token stream, kept from the original implementation.
         console.log(output.lexemes().map(function(x){ return x.begin + '-' + x.end + ':' + x.type + ':' + x.value; }));
         return output;
     };
 }());
	//
	// Lexical Analyzer for Scheme R5RS
	//
	// TODO: numbers, better error reporting
	//
	// Depends on Functional Javascript: http://osteele.com/sources/javascript/functional/
	//

	// Per the header above, this file depends on Functional Javascript. install()
	// is presumably what publishes that library's helpers as globals -- TODO
	// confirm against the library documentation.
	Functional.install();

	//
	// String.prototype.tokenize: lexical analyzer for a subset of R5RS Scheme.
	//
	// Called on a program string. Returns an object with these accessors:
	//   lexemes() - array of token instances in source order. Each instance
	//               inherits `type` from its token definition and carries
	//               `value` (the parsed lexeme), `line`, `begin` and `end`
	//               (character offsets into the input, `end` exclusive).
	//   symbols() - map from identifier text to {type: 'system' | 'user'};
	//               'system' marks the syntactic keywords listed below.
	// addSymbol()/addLexeme() are exposed on the result as well.
	//
	// Spaces, newlines and comments are consumed but produce no lexeme.
	// Throws an Error whose name is 'UnknownToken' when no pattern matches.
	//
	String.prototype.tokenize = (function() {
	    var expressionKeywords = ['quote', 'lambda', 'if', 'set!', 'begin', 'cond', 'and', 'or', 'case', 'let', 'let*', 'letrec', 'do', 'delay', 'quasiquote'];
	    var syntacticKeywords = ['else', '=>', 'define', 'unquote', 'unquote-splicing'].concat(expressionKeywords);
	    // Plain-function parsers (no string lambdas, so no code is built from text).
	    var identity = function(x) { return x; };
	    var constant = function(value) { return function() { return value; }; };
	    // Alternation uses a real "|" (an escaped "\|" would match a literal pipe).
	    // At least one character is required ("+", not "*"): an empty match would
	    // consume nothing and spin the main loop forever, e.g. on an unterminated
	    // string. "+", "-" and "." are escaped individually so the class does not
	    // form the range "+".."." (which would also admit ",").
	    var identifierPattern = /^((?:\+|-|\.\.\.)|[0-9A-Za-z!\$%&\*\/:<=>\?\^_~\+\-\.@]+)[ \n\(\)";]/;
	    // Tried in order; the first pattern that matches wins. Only capture group 1
	    // is consumed, so a trailing delimiter stays in the input for the next round.
	    // Entries without `parse` are skipped silently (no lexeme is emitted).
	    var tokens = [{pattern:/^( )+/, type:'space'}
	        ,{pattern:/^(\n)+/, type:'newline'}
	        ,{pattern:/^(;.*)/, type:'comment'}
	        ,{pattern:/^(\()/, type:'lparen', parse:identity}
	        ,{pattern:/^(\))/, type:'rparen', parse:identity}
	        ,{pattern:/^([0-9]+)[ \n\(\)";]/, type:'number', parse:function(x){ return parseInt(x, 10); }}
	        ,{pattern:/^(#t)/, type:'boolean', parse:constant(true)}
	        ,{pattern:/^(#f)/, type:'boolean', parse:constant(false)}
	        ,{pattern:/^(#\\space)[ \n\(\)";]/, type:'character', parse:constant(' ')}
	        ,{pattern:/^(#\\newline)[ \n\(\)";]/, type:'character', parse:constant('\n')}
	        ,{pattern:/^(#\\.)[ \n\(\)";]/, type:'character', parse:function(x){ return x.substring(2); }}
	        ,{pattern:/^("(?:[^"\\]|\\"|\\\\)*")/, type:'string', parse:function(x){ return x.substring(1, x.length - 1); }}
	        ,{pattern:identifierPattern, type:'identifier', parse:identity, identify:true}
	    ];
	    // Returns up to MAX leading non-whitespace characters of `program` for use
	    // in an error message, or '' when there are none.
	    var getErrorSnippet = function(program) {
	        var MAX = 10;
	        var snippet = /\S+/.exec(program);
	        return (snippet && snippet[0] && snippet[0].substring(0, MAX)) || '';
	    };
	    // Builds a lexeme object that inherits from its token definition.
	    var makeLexeme = function(token, lexeme, line, begin, end) {
	        var instance = Object.create(token);
	        instance.value = token.parse(lexeme);
	        instance.line = line;
	        instance.begin = begin;
	        instance.end = end;
	        return instance;
	    };
	    return function() {
	        var program = this ? this + ' ' : false; // EOF delimiter for implicit termination
	        var output = (function() {
	            var symbols = {};
	            var lexemes = [];
	            return {
	                addSymbol: function(lexeme) {
	                    // Own-property test: a truthiness test would see inherited
	                    // members such as "constructor" and never record them.
	                    if (!Object.prototype.hasOwnProperty.call(symbols, lexeme)) {
	                        symbols[lexeme] = {
	                            type: syntacticKeywords.indexOf(lexeme) !== -1 ? 'system' : 'user'
	                        };
	                    }
	                },
	                symbols: function() {
	                    return symbols;
	                },
	                addLexeme: function(lexeme) {
	                    lexemes.push(lexeme);
	                },
	                lexemes: function() {
	                    return lexemes;
	                }
	            };
	        }());
	        var begin = 0, end = 0, line = 1, character = 0;
	        var i, token, match, lexeme, error;
	        while (program) {
	            match = null;
	            for (i = 0; i < tokens.length; i += 1) {
	                token = tokens[i];
	                match = token.pattern.exec(program);
	                if (match) break;
	            }
	            if (!match) {
	                error = new Error('Unknown token starting on line ' + line + ', character ' + character + (getErrorSnippet(program) ? (': ' + getErrorSnippet(program)) : '.'));
	                error.name = 'UnknownToken';
	                throw error;
	            }
	            lexeme = match[1];
	            end = begin + lexeme.length;
	            if (token.parse) {
	                output.addLexeme(makeLexeme(token, lexeme, line, begin, end));
	            }
	            if (token.identify) {
	                output.addSymbol(lexeme);
	            }
	            begin = end;
	            if (lexeme === '\n') {
	                // The newline itself is not a column on the next line, so the
	                // first character of every line is reported as character 0.
	                line += 1;
	                character = 0;
	            } else {
	                character += lexeme.length;
	            }
	            program = program.substring(lexeme.length);
	        }
	        // Debug trace of the token stream, kept from the original implementation.
	        console.log(output.lexemes().map(function(x){ return x.begin + '-' + x.end + ':' + x.type + ':' + x.value; }));
	        return output;
	    };
	}());