Last active
May 25, 2024 23:29
-
-
Save etscrivner/b4499524aaed5fccc1a9ce3635df14f0 to your computer and use it in GitHub Desktop.
Simple math expression tokenizer and parser in C.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdio.h> | |
#include <stdbool.h> | |
#include <stdarg.h> | |
typedef enum eToken_Type eToken_Type; | |
enum eToken_Type { | |
TOKENTYPE_number, | |
TOKENTYPE_operator, | |
TOKENTYPE_lparen, | |
TOKENTYPE_rparen, | |
TOKENTYPE_COUNT | |
}; | |
typedef enum eOperation eOperation; | |
enum eOperation { | |
OPERATION_add, | |
OPERATION_sub, | |
OPERATION_mul, | |
OPERATION_div, | |
OPERATION_COUNT | |
}; | |
typedef struct Token Token; | |
struct Token { | |
eToken_Type Type; | |
union { | |
int Literal; | |
eOperation Operation; | |
}; | |
}; | |
typedef enum eNode_Type eNode_Type; | |
enum eNode_Type { | |
NODETYPE_literal, | |
NODETYPE_bin_op, | |
NODETYPE_COUNT | |
}; | |
typedef struct Syntax_Node Syntax_Node; | |
struct Syntax_Node { | |
eNode_Type Type; | |
union { | |
int Literal; | |
struct { | |
eOperation Op; | |
Syntax_Node* Left; | |
Syntax_Node* Right; | |
} BinOp; | |
}; | |
}; | |
static const char* kNameFromOperation[OPERATION_COUNT] = { | |
[OPERATION_add] = "Add", | |
[OPERATION_sub] = "Subtract", | |
[OPERATION_mul] = "Multiply", | |
[OPERATION_div] = "Divide" | |
}; | |
/* | |
================= | |
Tokenize | |
================= | |
*/ | |
void Tokenize(const char* str, Token* out_tokens, int* out_tokens_count, int out_tokens_max) { | |
int token_count = 0; | |
const char* s = str; | |
Token* t; | |
#define PushToken(PTR, TYPE) { assert(token_count < out_tokens_max); (PTR) = &out_tokens[token_count++]; (PTR)->Type = (TYPE); } | |
while (*s) { | |
switch (*s) { | |
case '+': { | |
PushToken(t, TOKENTYPE_operator); | |
t->Operation = OPERATION_add; | |
++s; | |
} break; | |
case '-': { | |
PushToken(t, TOKENTYPE_operator); | |
t->Operation = OPERATION_sub; | |
++s; | |
} break; | |
case '*': { | |
PushToken(t, TOKENTYPE_operator); | |
t->Operation = OPERATION_mul; | |
++s; | |
} break; | |
case '/': { | |
PushToken(t, TOKENTYPE_operator); | |
t->Operation = OPERATION_div; | |
++s; | |
} break; | |
case '(': { | |
PushToken(t, TOKENTYPE_lparen); | |
++s; | |
} break; | |
case ')': { | |
PushToken(t, TOKENTYPE_rparen); | |
++s; | |
} break; | |
case ' ': /* fallthrough */ | |
case '\t': /* fallthrough */ | |
case '\n': /* fallthrough */ | |
case '\r': { | |
++s; | |
} break; | |
default: { | |
if (isdigit(*s)) { | |
int value = 0; | |
while (*s && isdigit(*s)) { | |
value *= 10; | |
value += (*s - '0'); | |
++s; | |
} | |
PushToken(t, TOKENTYPE_number); | |
t->Literal = value; | |
} else { | |
fprintf(stderr, "invalid character: [%c]\n", *s); | |
exit(1); | |
} | |
} break; | |
}; | |
} | |
*out_tokens_count = token_count; | |
#undef PushToken | |
} | |
/* | |
================= | |
PrintTokens | |
================= | |
*/ | |
void PrintTokens(Token* tokens, int token_count) { | |
for (int i = 0; i < token_count; ++i) { | |
switch (tokens[i].Type) { | |
case TOKENTYPE_operator: { | |
printf("OPERATOR(%s)\n", kNameFromOperation[tokens[i].Operation]); | |
} break; | |
case TOKENTYPE_lparen: { | |
printf("LPAREN\n"); | |
} break; | |
case TOKENTYPE_rparen: { | |
printf("RPAREN\n"); | |
} break; | |
case TOKENTYPE_number: { | |
printf("NUMBER(%d)\n", tokens[i].Literal); | |
} break; | |
default: break; | |
} | |
} | |
} | |
typedef struct Parser_State Parser_State; | |
struct Parser_State { | |
Token* Token; | |
Token* TokenOnePastLast; | |
Syntax_Node* NodePool; | |
int NodePoolIdx; | |
int NodePoolCount; | |
int ParenDepth; | |
bool HasError; | |
char ErrorMsg[64]; | |
}; | |
/* | |
================= | |
ParseError | |
================= | |
*/ | |
void ParseError(Parser_State* p, const char* restrict fmt, ...) { | |
p->HasError = true; | |
va_list args; | |
va_start(args, fmt); | |
vsnprintf(p->ErrorMsg, 64, fmt, args); | |
va_end(args); | |
} | |
/* | |
================= | |
MaybeToken | |
================= | |
*/ | |
bool MaybeToken(Parser_State*p, eToken_Type type) { | |
bool result = false; | |
if (!p->HasError && p->Token->Type == type) { | |
++p->Token; | |
result = true; | |
} | |
return(result); | |
} | |
/* | |
================= | |
PushNode | |
================= | |
*/ | |
Syntax_Node* PushNode(Parser_State* state, eNode_Type type) { | |
assert(state->NodePoolIdx < state->NodePoolCount); | |
Syntax_Node* node = &state->NodePool[state->NodePoolIdx++]; | |
node->Type = type; | |
return(node); | |
} | |
/* | |
================= | |
ParseExpression | |
================= | |
*/ | |
Syntax_Node* ParseExpression(Parser_State* state) { | |
Syntax_Node* result = NULL; | |
if (!state->HasError) { | |
Token* prev_token = state->Token; | |
if (MaybeToken(state, TOKENTYPE_number)) { | |
Syntax_Node* lhs_num = PushNode(state, NODETYPE_literal); | |
lhs_num->Literal = prev_token->Literal; | |
Token* op_token = state->Token; | |
if (MaybeToken(state, TOKENTYPE_operator)) { | |
Syntax_Node* op = PushNode(state, NODETYPE_bin_op); | |
op->BinOp.Left = lhs_num; | |
op->BinOp.Op = op_token->Operation; | |
op->BinOp.Right = ParseExpression(state); | |
result = op; | |
} else if (MaybeToken(state, TOKENTYPE_rparen)) { | |
if (state->ParenDepth > 0) { | |
--state->ParenDepth; | |
result = lhs_num; | |
} else { | |
ParseError(state, "Extra close parenthesis\n"); | |
} | |
} else { | |
result = lhs_num; | |
} | |
} else if (MaybeToken(state, TOKENTYPE_lparen)) { | |
int start_paren_depth = state->ParenDepth; | |
++state->ParenDepth; | |
result = ParseExpression(state); | |
if (state->ParenDepth == start_paren_depth) { | |
Token* op_token = state->Token; | |
if (MaybeToken(state, TOKENTYPE_operator)) { | |
Syntax_Node* op = PushNode(state, NODETYPE_bin_op); | |
op->BinOp.Left = result; | |
op->BinOp.Op = op_token->Operation; | |
op->BinOp.Right = ParseExpression(state); | |
result = op; | |
} | |
} else { | |
ParseError(state, "Unclosed parenthesis\n"); | |
} | |
} else { | |
ParseError(state, "Unexpected token: [%d]\n", state->Token->Type); | |
} | |
} | |
return(result); | |
} | |
/* | |
================= | |
Parse | |
================= | |
*/ | |
Syntax_Node* Parse(Token* tokens, int token_count, Syntax_Node* nodes, int node_max) { | |
Parser_State state = { | |
.Token = tokens, | |
.TokenOnePastLast = tokens + token_count, | |
.NodePool = nodes, | |
.NodePoolIdx = 0, | |
.NodePoolCount = node_max, | |
.ParenDepth = 0, | |
.HasError = false | |
}; | |
Syntax_Node* root = ParseExpression(&state); | |
if (state.HasError) { | |
fprintf(stderr, "Parse Error: %s\n", state.ErrorMsg); | |
root = NULL; | |
} | |
return(root); | |
} | |
/* | |
================= | |
PrintSyntaxTree | |
================= | |
*/ | |
void PrintSyntaxTree(Syntax_Node* root, int level) { | |
for (int i = 0; i < level; ++i) { | |
printf("|\t"); | |
} | |
switch (root->Type) { | |
case NODETYPE_bin_op: { | |
printf("- BinOp(%s)\n", kNameFromOperation[root->BinOp.Op]); | |
PrintSyntaxTree(root->BinOp.Left, level + 1); | |
PrintSyntaxTree(root->BinOp.Right, level + 1); | |
} break; | |
case NODETYPE_literal: { | |
printf("- Literal(%d)\n", root->Literal); | |
} break; | |
default: break; | |
} | |
} | |
void main(void) { | |
#define MAX_TOKENS 64 | |
#define MAX_NODES 64 | |
Token tokens[MAX_TOKENS]; | |
Syntax_Node nodes[MAX_NODES]; | |
int token_count; | |
Tokenize("(3 * 5) / (4 + 6)", tokens, &token_count, MAX_TOKENS); | |
PrintTokens(tokens, token_count); | |
Syntax_Node* root = Parse(tokens, token_count, nodes, MAX_NODES); | |
if (root) { | |
PrintSyntaxTree(root, 0); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment