Skip to content

Instantly share code, notes, and snippets.

@jnorthrup
Created October 27, 2024 20:51
Show Gist options
  • Save jnorthrup/ebdb481e54d8094653804e516eb844f0 to your computer and use it in GitHub Desktop.
Save jnorthrup/ebdb481e54d8094653804e516eb844f0 to your computer and use it in GitHub Desktop.
Dogfooding: a C lexer/emitter written in C
/* Function prototypes */
/*
 * C library routines used by this program, declared by hand rather than
 * pulled in through headers.
 * NOTE(review): the C standard declares malloc's parameter as size_t, which
 * is not `unsigned long` on every platform -- confirm for your build targets.
 */
/* Writes one character to standard output. */
int putchar(int c);
/* Reads one character from standard input; main treats -1 as end of input. */
int getchar(void);
/* Allocates `size` bytes of heap memory. */
void* malloc(unsigned long size);
/* Releases memory previously obtained from malloc. */
void free(void *ptr);
/* Basic types */
/* Any: untyped pointer alias (not referenced by the code visible here). */
typedef void* Any;
/* String: pointer to read-only, NUL-terminated character data. */
typedef const char* String;
/* Token types */
/*
 * Category assigned to each token by lexer_get_token. main prints the
 * enumerator's integer value, so the declaration order below is visible
 * in the program's output.
 */
typedef enum {
/* End of input reached. */
TOK_EOF,
/* Run of letters, digits and underscores that starts with a letter or '_'. */
TOK_IDENT,
/* Run of decimal digits. */
TOK_NUMBER,
/* Text found between a pair of double quotes. */
TOK_STRING,
/* Any other single non-whitespace character. */
TOK_OPERATOR
} TokenType;
/* Token structure */
/*
 * One lexed token: its category plus its text.
 * For TOK_EOF, lexer_get_token points `text` at the string literal "EOF";
 * for every other type `text` is malloc'd memory, which main releases with
 * free() after printing.
 */
typedef struct {
TokenType type;
String text;
} Token;
/* Lexer state */
/*
 * Scanning cursor over a NUL-terminated input string.
 *   source        the text being scanned (stored by pointer, not copied)
 *   pos           index into `source` of the character under the cursor
 *   current_char  cached copy of source[pos]; '\0' marks end of input
 */
typedef struct {
String source;
unsigned long pos;
char current_char;
} Lexer;
/* Function prototypes */
/* Forward declarations for the lexer entry points and the output helpers. */
/* Points `lexer` at the start of `source`. */
void lexer_init(Lexer* lexer, String source);
/* Moves the cursor one character forward unless it is already at '\0'. */
void lexer_advance(Lexer* lexer);
/* Skips whitespace and returns the next token from the input. */
Token lexer_get_token(Lexer* lexer);
/* Writes a single character to standard output. */
void write_char(char c);
/* Writes a NUL-terminated string to standard output. */
void write_str(const char* s);
/* Writes the decimal representation of `n` to standard output. */
void write_int(int n);
/* Lexer implementation */
/*
 * Prepare `lexer` to scan `source` from its first character.
 *
 * The pointer is stored, not copied, and is read again on every advance,
 * so `source` has to stay valid for as long as the lexer is in use.
 */
void lexer_init(Lexer* lexer, String source) {
    lexer->current_char = *source;
    lexer->pos = 0;
    lexer->source = source;
}
/*
 * Step the cursor to the next character of the input.
 *
 * Once the cursor sits on the terminating '\0' the call does nothing, so
 * the position can never run past the end of the source string.
 */
void lexer_advance(Lexer* lexer) {
    if (lexer->current_char == '\0') {
        return;
    }
    lexer->pos += 1;
    lexer->current_char = lexer->source[lexer->pos];
}
/*
 * Advance past any run of spaces, tabs, newlines and carriage returns,
 * leaving the cursor on the first character that is none of those
 * (which may be the terminating '\0').
 */
void lexer_skip_whitespace(Lexer* lexer) {
    for (;;) {
        switch (lexer->current_char) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            lexer_advance(lexer);
            break;
        default:
            return;
        }
    }
}
/*
 * Return 1 when `c` may appear in an identifier as a non-digit character
 * (an unaccented Latin letter of either case, or an underscore), else 0.
 */
int is_letter(char c) {
    if (c == '_') {
        return 1;
    }
    if ('a' <= c && c <= 'z') {
        return 1;
    }
    return 'A' <= c && c <= 'Z';
}
/* Return 1 when `c` is one of the decimal digits '0' through '9', else 0. */
int is_digit(char c) {
    if (c < '0') {
        return 0;
    }
    return c <= '9';
}
/*
 * Consume the identifier under the cursor and return a copy of its text.
 *
 * The cursor moves past every consecutive letter, digit or underscore.
 * The returned string is NUL-terminated and allocated with malloc; the
 * caller owns it and must release it with free().
 *
 * Returns a null pointer if the allocation fails. The input has still been
 * consumed in that case, so scanning can continue with the next token.
 */
String lexer_collect_identifier(Lexer* lexer) {
    unsigned long start_pos = lexer->pos;
    while (is_letter(lexer->current_char) || is_digit(lexer->current_char)) {
        lexer_advance(lexer);
    }

    unsigned long length = lexer->pos - start_pos;
    char* text = malloc(length + 1); /* +1 for the terminator */
    if (!text) {
        /* Report the failure instead of writing through a null pointer. */
        return (String)0;
    }

    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    return text;
}
/*
 * Consume the run of decimal digits under the cursor and return a copy of
 * its text.
 *
 * The returned string is NUL-terminated and allocated with malloc; the
 * caller owns it and must release it with free().
 *
 * Returns a null pointer if the allocation fails. The digits have still
 * been consumed in that case, so scanning can continue with the next token.
 */
String lexer_collect_number(Lexer* lexer) {
    unsigned long start_pos = lexer->pos;
    while (is_digit(lexer->current_char)) {
        lexer_advance(lexer);
    }

    unsigned long length = lexer->pos - start_pos;
    char* text = malloc(length + 1); /* +1 for the terminator */
    if (!text) {
        /* Report the failure instead of writing through a null pointer. */
        return (String)0;
    }

    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    return text;
}
/*
 * Consume a double-quoted string literal and return a copy of its contents.
 *
 * On entry the cursor must be on the opening quote. The text between the
 * quotes is copied verbatim (escape sequences are kept as written, not
 * decoded) into a NUL-terminated buffer allocated with malloc; the caller
 * owns it and must release it with free().
 *
 * A backslash protects the character that follows it, so an escaped quote
 * (\") does not end the literal. If the input ends before a closing quote
 * is found, everything up to the end of input is returned.
 *
 * Returns a null pointer if the allocation fails. The literal has still
 * been consumed in that case, so scanning can continue with the next token.
 */
String lexer_collect_string(Lexer* lexer) {
    lexer_advance(lexer); /* Skip opening quote */
    unsigned long start_pos = lexer->pos;

    while (lexer->current_char != '"' && lexer->current_char != '\0') {
        /* Step over the backslash so the escaped character is never taken
         * for the closing quote; a trailing backslash at end of input has
         * nothing to protect. */
        if (lexer->current_char == '\\' &&
            lexer->source[lexer->pos + 1] != '\0') {
            lexer_advance(lexer);
        }
        lexer_advance(lexer);
    }

    unsigned long length = lexer->pos - start_pos;
    lexer_advance(lexer); /* Skip closing quote (no-op at end of input) */

    char* text = malloc(length + 1); /* +1 for the terminator */
    if (!text) {
        /* Report the failure instead of writing through a null pointer. */
        return (String)0;
    }

    unsigned long i;
    for (i = 0; i < length; i++) {
        text[i] = lexer->source[start_pos + i];
    }
    text[length] = '\0';
    return text;
}
/*
 * Skip leading whitespace and return the next token from the input.
 *
 * Classification is decided by the first non-whitespace character:
 *   '\0'            -> TOK_EOF, text is the string literal "EOF"
 *   letter or '_'   -> TOK_IDENT
 *   digit           -> TOK_NUMBER
 *   '"'             -> TOK_STRING
 *   anything else   -> TOK_OPERATOR holding that single character
 *
 * For every type except TOK_EOF the text is heap-allocated and the caller
 * must free() it. For TOK_OPERATOR the text is a null pointer if the
 * allocation fails; the character is consumed regardless, so the lexer
 * always makes progress.
 */
Token lexer_get_token(Lexer* lexer) {
    lexer_skip_whitespace(lexer);

    Token token;
    token.text = (String)0;

    if (lexer->current_char == '\0') {
        token.type = TOK_EOF;
        token.text = "EOF"; /* static storage: must not be freed */
    } else if (is_letter(lexer->current_char)) {
        token.type = TOK_IDENT;
        token.text = lexer_collect_identifier(lexer);
    } else if (is_digit(lexer->current_char)) {
        token.type = TOK_NUMBER;
        token.text = lexer_collect_number(lexer);
    } else if (lexer->current_char == '"') {
        token.type = TOK_STRING;
        token.text = lexer_collect_string(lexer);
    } else {
        char op = lexer->current_char;
        lexer_advance(lexer);
        token.type = TOK_OPERATOR;

        char* text = malloc(2); /* one character plus the terminator */
        if (text) {
            text[0] = op;
            text[1] = '\0';
        }
        token.text = text; /* stays null when the allocation failed */
    }
    return token;
}
/* Helper functions */
/* Emit the single character `c` on standard output via putchar. */
void write_char(char c) {
    int ch = c;
    putchar(ch);
}
/*
 * Write the NUL-terminated string `s` to standard output, one character at
 * a time. A null pointer is treated as an empty string and prints nothing,
 * so a token whose text was never filled in cannot crash the output path.
 */
void write_str(const char* s) {
    if (!s) {
        return;
    }
    for (; *s != '\0'; s++) {
        write_char(*s);
    }
}
/*
 * Write the decimal representation of `n` to standard output, with a
 * leading '-' for negative values.
 *
 * The magnitude is computed in unsigned arithmetic because negating the
 * most negative int overflows a signed int (undefined behavior), whereas
 * the unsigned subtraction below is well defined for every input.
 */
void write_int(int n) {
    unsigned int magnitude;
    if (n < 0) {
        write_char('-');
        magnitude = 0u - (unsigned int)n;
    } else {
        magnitude = (unsigned int)n;
    }

    /* With 8-bit bytes, each byte of an int contributes fewer than three
     * decimal digits, so this bound holds whatever width int has. */
    char digits[sizeof(int) * 3 + 1];
    unsigned long count = 0;

    /* Digits come out least-significant first; do/while also covers 0. */
    do {
        digits[count++] = (char)('0' + (magnitude % 10u));
        magnitude /= 10u;
    } while (magnitude > 0u);

    while (count > 0) {
        write_char(digits[--count]);
    }
}
/* Main function */
/*
 * Read all of standard input into memory, tokenize it, and print one line
 * per token in the form
 *     Token: Type=<integer>, Text='<text>'
 * ending with the TOK_EOF token.
 *
 * Returns 0 on success, or 1 if the input buffer cannot be allocated or
 * grown.
 */
int main(void) {
    /* Read the entire input into a string */
    unsigned long buffer_size = 1024;
    char* source_code = malloc(buffer_size);
    if (!source_code) {
        return 1;
    }

    unsigned long length = 0;
    int c;
    /* getchar signals end of input with EOF; the literal -1 is used because
     * the EOF macro is not declared by the hand-written prototypes. */
    while ((c = getchar()) != -1) {
        if (length >= buffer_size - 1) {
            /* Grow by doubling, always keeping room for the terminator. */
            unsigned long new_size = buffer_size * 2;
            if (new_size <= buffer_size) {
                /* Doubling wrapped around: the input is too large. */
                free(source_code);
                return 1;
            }
            char* new_buffer = malloc(new_size);
            if (!new_buffer) {
                free(source_code);
                return 1;
            }
            unsigned long i;
            for (i = 0; i < length; i++) {
                new_buffer[i] = source_code[i];
            }
            free(source_code);
            source_code = new_buffer;
            buffer_size = new_size;
        }
        source_code[length++] = (char)c;
    }
    source_code[length] = '\0';

    Lexer lexer;
    lexer_init(&lexer, source_code);

    Token token;
    do {
        token = lexer_get_token(&lexer);
        /* Output the token */
        write_str("Token: Type=");
        write_int(token.type);
        write_str(", Text='");
        /* A token without text is printed with an empty text field. */
        write_str(token.text ? token.text : "");
        write_str("'\n");
        /* Only TOK_EOF carries a string literal; every other token's text
         * was heap-allocated by the lexer and is released here. */
        if (token.type == TOK_IDENT || token.type == TOK_NUMBER ||
            token.type == TOK_STRING || token.type == TOK_OPERATOR) {
            free((void*)token.text);
        }
    } while (token.type != TOK_EOF);

    free(source_code);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment