Created
October 27, 2024 20:51
-
-
Save jnorthrup/ebdb481e54d8094653804e516eb844f0 to your computer and use it in GitHub Desktop.
Dogfooding: a C lexer/emitter written in C, using no standard headers (libc prototypes are declared by hand).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Minimal libc prototypes — this file includes no headers. */
int putchar(int c);
int getchar(void);
void* malloc(unsigned long size);
void free(void *ptr);

/* Convenience aliases */
typedef void* Any;           /* generic pointer */
typedef const char* String;  /* read-only NUL-terminated string */

/* Kinds of token the lexer can produce */
typedef enum {
    TOK_EOF,      /* end of input */
    TOK_IDENT,    /* identifier: letters, digits, underscores */
    TOK_NUMBER,   /* run of decimal digits */
    TOK_STRING,   /* double-quoted string body (no escapes) */
    TOK_OPERATOR  /* any other single character */
} TokenType;

/* One lexed token: its kind plus its text */
typedef struct {
    TokenType type;
    String text;
} Token;

/* Scanner state: source buffer, cursor, and cached current character */
typedef struct {
    String source;
    unsigned long pos;
    char current_char;
} Lexer;

/* Forward declarations */
void lexer_init(Lexer* lexer, String source);
void lexer_advance(Lexer* lexer);
Token lexer_get_token(Lexer* lexer);
void write_char(char c);
void write_str(const char* s);
void write_int(int n);
/* Lexer implementation */ | |
/* Point the lexer at `source` and prime the one-character lookahead. */
void lexer_init(Lexer* lexer, String source) {
    lexer->pos = 0;
    lexer->source = source;
    lexer->current_char = lexer->source[lexer->pos];
}
/* Move one character forward; once the NUL terminator is reached the
 * lexer stays parked there forever. */
void lexer_advance(Lexer* lexer) {
    if (lexer->current_char == '\0') {
        return; /* already at end of input */
    }
    lexer->pos++;
    lexer->current_char = lexer->source[lexer->pos];
}
/* Consume spaces, tabs, and CR/LF so the next token starts on content.
 * (Deliberately does not use isspace(): only these four characters.) */
void lexer_skip_whitespace(Lexer* lexer) {
    for (;;) {
        char c = lexer->current_char;
        if (c != ' ' && c != '\t' && c != '\n' && c != '\r') {
            break;
        }
        lexer_advance(lexer);
    }
}
/* True for ASCII letters and underscore (identifier characters). */
int is_letter(char c) {
    if (c == '_') {
        return 1;
    }
    if ('a' <= c && c <= 'z') {
        return 1;
    }
    return 'A' <= c && c <= 'Z';
}
/* True for ASCII decimal digits '0'..'9'. */
int is_digit(char c) {
    return '0' <= c && c <= '9';
}
/* Collect a maximal run of letters/digits/underscores starting at the
 * current position. Returns a freshly malloc'd NUL-terminated copy
 * (ownership passes to the caller), or NULL on allocation failure. */
String lexer_collect_identifier(Lexer* lexer) {
    unsigned long start_pos = lexer->pos;
    while (is_letter(lexer->current_char) || is_digit(lexer->current_char)) {
        lexer_advance(lexer);
    }
    unsigned long length = lexer->pos - start_pos;
    char* text = (char*)malloc(length + 1);
    if (text == (char*)0) {
        return (String)0; /* out of memory — caller must cope */
    }
    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    return text;
}
/* Collect a maximal run of decimal digits starting at the current
 * position. Returns a freshly malloc'd NUL-terminated copy (ownership
 * passes to the caller), or NULL on allocation failure. */
String lexer_collect_number(Lexer* lexer) {
    unsigned long start_pos = lexer->pos;
    while (is_digit(lexer->current_char)) {
        lexer_advance(lexer);
    }
    unsigned long length = lexer->pos - start_pos;
    char* text = (char*)malloc(length + 1);
    if (text == (char*)0) {
        return (String)0; /* out of memory — caller must cope */
    }
    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    return text;
}
/* Collect the body of a double-quoted string. Assumes the lexer sits on
 * the opening quote; consumes both quotes. No escape sequences are
 * recognized. An unterminated string yields everything up to end of
 * input (the trailing advance is a no-op at EOF). Returns a malloc'd
 * copy (caller frees), or NULL on allocation failure. */
String lexer_collect_string(Lexer* lexer) {
    lexer_advance(lexer); /* skip opening quote */
    unsigned long start_pos = lexer->pos;
    while (lexer->current_char != '"' && lexer->current_char != '\0') {
        lexer_advance(lexer);
    }
    unsigned long length = lexer->pos - start_pos;
    char* text = (char*)malloc(length + 1);
    if (text == (char*)0) {
        return (String)0; /* out of memory — caller must cope */
    }
    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    lexer_advance(lexer); /* skip closing quote (no-op at EOF) */
    return text;
}
/* Scan and return the next token. For IDENT/NUMBER/STRING/OPERATOR the
 * text is heap-allocated and owned by the caller; the EOF token reuses
 * a static literal that must NOT be freed. On allocation failure the
 * function degrades to an EOF token so driver loops terminate cleanly
 * instead of dereferencing a NULL text pointer. */
Token lexer_get_token(Lexer* lexer) {
    lexer_skip_whitespace(lexer);
    Token token;
    token.text = (String)0;
    if (lexer->current_char == '\0') {
        token.type = TOK_EOF;
        token.text = "EOF";
    } else if (is_letter(lexer->current_char)) {
        token.type = TOK_IDENT;
        token.text = lexer_collect_identifier(lexer);
    } else if (is_digit(lexer->current_char)) {
        token.type = TOK_NUMBER;
        token.text = lexer_collect_number(lexer);
    } else if (lexer->current_char == '"') {
        token.type = TOK_STRING;
        token.text = lexer_collect_string(lexer);
    } else {
        char op = lexer->current_char;
        lexer_advance(lexer);
        token.type = TOK_OPERATOR;
        char* text = (char*)malloc(2);
        if (text != (char*)0) {
            text[0] = op;
            text[1] = '\0';
        }
        token.text = text; /* may be NULL on OOM; handled below */
    }
    if (token.text == (String)0) {
        /* Out of memory in a collector: report EOF so callers stop. */
        token.type = TOK_EOF;
        token.text = "EOF";
    }
    return token;
}
/* Helper functions */ | |
/* Emit a single character to stdout. */
void write_char(char c) {
    (void)putchar(c);
}
/* Write a NUL-terminated string to stdout. A NULL pointer is treated
 * as the empty string rather than dereferenced (tokens can carry NULL
 * text if an allocation failed upstream). */
void write_str(const char* s) {
    if (s == (const char*)0) {
        return;
    }
    while (*s != '\0') {
        write_char(*s);
        s++;
    }
}
/* Print a signed int in decimal without stdio formatting.
 * The magnitude is accumulated in unsigned arithmetic because negating
 * INT_MIN in int (`n = -n`) is signed overflow — undefined behavior. */
void write_int(int n) {
    if (n == 0) {
        write_char('0');
        return;
    }
    unsigned int magnitude;
    if (n < 0) {
        write_char('-');
        magnitude = 0u - (unsigned int)n; /* well-defined for INT_MIN */
    } else {
        magnitude = (unsigned int)n;
    }
    char buffer[12]; /* 10 digits cover 32-bit magnitudes, plus slack */
    int i = 0;
    while (magnitude > 0u) {
        buffer[i++] = (char)('0' + (magnitude % 10u));
        magnitude /= 10u;
    }
    while (i > 0) {
        write_char(buffer[--i]); /* digits were collected in reverse */
    }
}
/* Main function */ | |
int main() { | |
/* Read the entire input into a string */ | |
unsigned long buffer_size = 1024; | |
char* source_code = (char*)malloc(buffer_size); | |
unsigned long length = 0; | |
int c; | |
while ((c = getchar()) != -1) { // EOF is typically -1 | |
if (length >= buffer_size - 1) { | |
/* Reallocate buffer */ | |
buffer_size *= 2; | |
char* new_buffer = (char*)malloc(buffer_size); | |
unsigned long i; | |
for (i = 0; i < length; i++) { | |
new_buffer[i] = source_code[i]; | |
} | |
free(source_code); | |
source_code = new_buffer; | |
} | |
source_code[length++] = (char)c; | |
} | |
source_code[length] = '\0'; | |
Lexer lexer; | |
lexer_init(&lexer, source_code); | |
Token token; | |
do { | |
token = lexer_get_token(&lexer); | |
/* Output the token */ | |
write_str("Token: Type="); | |
write_int(token.type); | |
write_str(", Text='"); | |
write_str(token.text); | |
write_str("'\n"); | |
if (token.type == TOK_IDENT || token.type == TOK_NUMBER || | |
token.type == TOK_STRING || token.type == TOK_OPERATOR) { | |
free((void*)token.text); | |
} | |
} while (token.type != TOK_EOF); | |
free(source_code); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment