Created
May 21, 2011 23:20
-
-
Save od0x0/984994 to your computer and use it in GitHub Desktop.
Lexer (not sure if this is my broken one)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <stdbool.h> | |
#include <iso646.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <ctype.h> | |
// Result codes reported by the lexer.
typedef enum {
    // No error occurred.
    LX1LexerErrorNone = 0,
    // Catch-all failure code; the only error the lexer currently reports.
    LX1LexerErrorUnknown = 1
} LX1LexerError;
// Kinds of token the lexer can produce.
// LX1LexerTokenTypeNone is deliberately zero so a token type can be tested
// for truthiness.
typedef enum {
    LX1LexerTokenTypeNone = 0,            // no token was produced
    LX1LexerTokenTypeComma,               // ','
    LX1LexerTokenTypeIdentifier,          // identifier; text is in the buffer
    LX1LexerTokenTypeOperator,            // comparison/arithmetic operator; text is in the buffer
    LX1LexerTokenTypeBar,                 // '|'
    LX1LexerTokenTypeNumber,              // numeric literal; text is in the buffer
    LX1LexerTokenTypeString,              // string literal; unescaped contents are in the buffer
    LX1LexerTokenTypeAssign,              // a single '='
    LX1LexerTokenTypeOpeningParenthesis,  // '('
    LX1LexerTokenTypeClosingParenthesis,  // ')'
    LX1LexerTokenTypeTerminal             // statement terminator: newline or ';'
} LX1LexerTokenType;
typedef struct LX1Lexer LX1Lexer; | |
struct LX1Lexer{ | |
//LX1Lexer State | |
size_t currentLineNumber; | |
long currentCharacterOfLineNumber; | |
LX1LexerTokenType lastTokenType; | |
//Buffering text | |
char* buffer; | |
size_t bufferUsed, bufferAllocated; | |
//Error handling | |
LX1LexerError lastError; | |
//Input | |
char (*pump)(LX1Lexer* self); | |
char (*peek)(LX1Lexer* self); | |
void* userdata; | |
//LX1Lexer configurations | |
}; | |
// Consumes and returns the next input character through the pump callback,
// keeping the line and column counters in step.
//
// Returns 0 when no pump callback is installed, or whatever the callback
// returns otherwise.
char LX1LexerPump(LX1Lexer* self){
    char current = 0;
    if (self->pump) current = self->pump(self);

    if (current == '\n') {
        self->currentLineNumber++;
        // Nothing on the new line has been consumed yet. Resetting to 0
        // mirrors the state LX1LexerInit leaves for the first line, so the
        // first character of every line is reported as column 1. (Resetting
        // to 1 made the first character of later lines column 2.)
        self->currentCharacterOfLineNumber = 0;
    } else {
        self->currentCharacterOfLineNumber++;
    }
    return current;
}
// Forwards to the peek callback and returns its result.
// Returns 0 when no peek callback is installed.
char LX1LexerPeek(LX1Lexer* self){
    return self->peek ? self->peek(self) : 0;
}
// Appends a NUL-terminated string to the lexer's text buffer, growing the
// allocation when needed. The buffer stays NUL-terminated.
//
// A NULL or empty string is ignored. If the buffer cannot be grown, the
// existing contents are left intact, nothing is appended, and
// LX1LexerErrorUnknown is stored in lastError (note that LX1LexerLex
// overwrites lastError when it returns).
void LX1LexerAddStringToBuffer(LX1Lexer* self, const char* string){
    if (string == NULL || string[0] == 0) return;

    size_t length = strlen(string);
    size_t needed = self->bufferUsed + length + 1;  // +1 for the terminator

    if (needed > self->bufferAllocated) {
        // Grow geometrically so character-by-character appends do not
        // reallocate every few bytes; fall back to the exact requirement
        // plus a little slack when doubling is not enough.
        size_t grown = self->bufferAllocated * 2;
        if (grown < needed) grown = needed + 14;

        // Keep the old pointer until realloc succeeds: on failure realloc
        // returns NULL and the original block is still valid and owned here.
        char* resized = realloc(self->buffer, grown);
        if (resized == NULL) {
            self->lastError = LX1LexerErrorUnknown;
            return;
        }
        self->buffer = resized;
        self->bufferAllocated = grown;
    }

    memcpy(self->buffer + self->bufferUsed, string, length);
    self->bufferUsed += length;
    self->buffer[self->bufferUsed] = '\0';
}
// Appends a single character to the lexer's text buffer.
// A NUL character is ignored.
void LX1LexerAddCharacterToBuffer(LX1Lexer* self, const char c){
    if (c != 0) {
        // Wrap the character in a one-character string so the string
        // append routine does the actual work.
        char single[2];
        single[0] = c;
        single[1] = '\0';
        LX1LexerAddStringToBuffer(self, single);
    }
}
// Releases the text buffer and resets the lexer to the empty-buffer state
// (buffer == NULL, zero bytes used and allocated).
void LX1LexerClearBuffer(LX1Lexer* self){
    char* released = self->buffer;

    self->bufferAllocated = 0;
    self->bufferUsed = 0;
    self->buffer = NULL;

    free(released);  // free(NULL) is a no-op, so an empty buffer is fine
}
// Replaces the buffer contents with a copy of the given string.
static inline void LX1LexerSetBuffer(LX1Lexer* self, const char* string)
{
    // Discard whatever was buffered before...
    LX1LexerClearBuffer(self);
    // ...then append the new text to the now-empty buffer.
    LX1LexerAddStringToBuffer(self, string);
}
// Returns true when c is an ASCII letter ('A'-'Z' or 'a'-'z').
static inline bool IsAlpha(char c){
    const bool isUpper = (c >= 'A') && (c <= 'Z');
    const bool isLower = (c >= 'a') && (c <= 'z');
    return isUpper || isLower;
}
// Returns true when c is an ASCII decimal digit ('0'-'9').
static inline bool IsDigit(char c){
    return (c >= '0') && (c <= '9');
}
// Lexes a numeric literal whose first character `c` has already been pumped.
// Accepted form: one or more digits, optionally followed by a single '.'
// and one or more digits. Accepted characters are appended to the buffer.
//
// Returns true when a complete number was buffered. Returns false when `c`
// does not start a number (nothing is consumed or buffered in that case),
// when the literal ends right after the decimal point, or when the input
// ends unexpectedly.
//
// The original loop allowed '.' in its look-ahead but rejected it once it
// was pumped, so any literal containing a decimal point failed.
static bool LX1LexerLexNumber(LX1Lexer* self, char c){
    if (!IsDigit(c)) return false;  // a number must start with a digit

    bool seenPoint = false;
    while (c) {
        if (c == '.') {
            if (seenPoint) break;   // defensive: look-ahead never lets a second '.' in
            seenPoint = true;
        }
        else if (!IsDigit(c)) break;

        LX1LexerAddCharacterToBuffer(self, c);

        char next = LX1LexerPeek(self);
        bool continues = IsDigit(next) || (next == '.' && !seenPoint);
        if (!continues) {
            // A literal that stops directly after '.' (e.g. "1.") is malformed.
            return c != '.';
        }
        c = LX1LexerPump(self);
    }
    return false;
}
// Lexes an identifier whose first character `c` has already been pumped.
// The first character may be a letter, '_' or '#'; following characters may
// be letters, '_' or digits. Accepted characters are appended to the buffer.
//
// Returns true once the upcoming character can no longer extend the
// identifier. Returns false when the current character is not valid at its
// position or when the input yields 0.
static bool LX1LexerLexIdentifier(LX1Lexer* self, char c){
    for (bool atStart = true; c != 0; atStart = false, c = LX1LexerPump(self)) {
        // '#' is only legal as the very first character, digits only after it.
        bool accepted = IsAlpha(c) || c == '_' || (atStart ? c == '#' : IsDigit(c));
        if (!accepted) return false;

        LX1LexerAddCharacterToBuffer(self, c);

        // Stop as soon as the look-ahead is not an identifier character.
        char upcoming = LX1LexerPeek(self);
        bool extends = IsAlpha(upcoming) || upcoming == '_' || IsDigit(upcoming);
        if (!extends) return true;
    }
    return false;
}
// Lexes a double-quoted string literal whose opening quote `c` has already
// been pumped. The contents, with escapes decoded, are appended to the
// buffer; the quotes themselves are not.
//
// Recognised escapes: \n, \t, \\ and \". For any other character after a
// backslash, the backslash is dropped and that character is then handled as
// ordinary input on the next iteration.
//
// Returns true when the closing quote is reached, false when `c` is not '"'
// or the input yields 0 before the string is closed.
static bool LX1LexerLexString(LX1Lexer* self, char c){
    if (c != '"') return false;

    for (;;) {
        c = LX1LexerPump(self);
        if (c == 0) return false;   // input ended inside the literal
        if (c == '"') return true;  // closing quote

        if (c != '\\') {
            LX1LexerAddCharacterToBuffer(self, c);
            continue;
        }

        // Backslash: translate the escape named by the upcoming character.
        char decoded = 0;
        switch (LX1LexerPeek(self)) {
            case 'n':  decoded = '\n'; break;
            case 't':  decoded = '\t'; break;
            case '\\': decoded = '\\'; break;
            case '"':  decoded = '"';  break;
            default:   break;       // unrecognised: leave it for the next iteration
        }
        if (decoded != 0) {
            LX1LexerAddCharacterToBuffer(self, decoded);
            LX1LexerPump(self);     // consume the escape character itself
        }
    }
}
// Discards input up to, but not including, the next '\n'.
// Stops early if `c` is 0 or the pump yields 0. The newline itself is left
// unconsumed for the caller.
static void LX1LexerEatSingleLineComment(LX1Lexer* self, char c){
    while (c != 0 && LX1LexerPeek(self) != '\n') {
        c = LX1LexerPump(self);
    }
}
// Lexes the next token from the input.
//
// The text buffer is cleared first; for operator, number, identifier and
// string tokens it then receives the token text. Spaces, tabs and
// `//` comments are skipped. The resulting token type and error are stored
// in self->lastTokenType and self->lastError, and the error is returned.
//
// When the pump yields 0 before any token is found, the token type is
// LX1LexerTokenTypeNone and the error is LX1LexerErrorNone.
//
// Fixes relative to the previous version:
//  - '>' and '>=' stored "<" and "<=" in the buffer (copy-paste from the
//    '<' case); they now store their own text.
//  - After a `//` comment the function returned LX1LexerTokenTypeNone with
//    no error, which is indistinguishable from end of input. It now keeps
//    scanning, so the newline that ends the comment is lexed as a terminal.
LX1LexerError LX1LexerLex(LX1Lexer* self){
    char c = 0;
    LX1LexerTokenType tokenType = LX1LexerTokenTypeNone;
    LX1LexerError error = LX1LexerErrorNone;

    LX1LexerClearBuffer(self);

    while ((c = LX1LexerPump(self)) != 0) {
        switch (c) {
            case '\n':
            case ';':
                tokenType = LX1LexerTokenTypeTerminal;
                goto end;

            case '=':
                if (LX1LexerPeek(self) == '=') {
                    LX1LexerPump(self);
                    tokenType = LX1LexerTokenTypeOperator;
                    LX1LexerSetBuffer(self, "==");
                }
                else tokenType = LX1LexerTokenTypeAssign;
                goto end;

            case '<':
            case '>': {
                // Build "<", "<=", ">" or ">=" from the character actually read.
                char op[3] = {c, 0, 0};
                if (LX1LexerPeek(self) == '=') {
                    LX1LexerPump(self);
                    op[1] = '=';
                }
                tokenType = LX1LexerTokenTypeOperator;
                LX1LexerSetBuffer(self, op);
                goto end;
            }

            case ',':
                tokenType = LX1LexerTokenTypeComma;
                goto end;

            case '|':
                tokenType = LX1LexerTokenTypeBar;
                goto end;

            case '+':
            case '-':
            case '*': {
                const char op[2] = {c, 0};
                tokenType = LX1LexerTokenTypeOperator;
                LX1LexerSetBuffer(self, op);
                goto end;
            }

            case '/':
                if (LX1LexerPeek(self) != '/') {
                    // A lone '/' is the division operator.
                    tokenType = LX1LexerTokenTypeOperator;
                    LX1LexerSetBuffer(self, "/");
                    goto end;
                }
                // "//" starts a comment: discard it and keep looking for a token.
                LX1LexerEatSingleLineComment(self, c);
                break;

            case '(':
                tokenType = LX1LexerTokenTypeOpeningParenthesis;
                goto end;

            case ')':
                tokenType = LX1LexerTokenTypeClosingParenthesis;
                goto end;

            case '"':
                if (LX1LexerLexString(self, c)) tokenType = LX1LexerTokenTypeString;
                else error = LX1LexerErrorUnknown;
                goto end;

            case ' ':
            case '\t':
                // Insignificant whitespace.
                break;

            default:
                // Try a number first, then an identifier; anything else is an error.
                if (LX1LexerLexNumber(self, c)) tokenType = LX1LexerTokenTypeNumber;
                else if (LX1LexerLexIdentifier(self, c)) tokenType = LX1LexerTokenTypeIdentifier;
                else error = LX1LexerErrorUnknown;
                goto end;
        }
    }

    // Every token-producing path above jumps here; end of input falls through.
end:
    self->lastTokenType = tokenType;
    self->lastError = error;
    return error;
}
// Puts a lexer into its initial state: every field zeroed (no buffer, no
// callbacks, no error) and the line counter starting at 1.
// The caller installs the pump/peek callbacks afterwards.
void LX1LexerInit(LX1Lexer* self)
{
    memset(self, 0, sizeof *self);
    self->currentLineNumber = 1;
}
// Releases the resources held by a lexer. The text buffer is the only
// allocation the lexer functions in this file make, so clearing it is all
// that is needed here.
void LX1LexerDeinit(LX1Lexer* self)
{
    LX1LexerClearBuffer(self);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment