Skip to content

Instantly share code, notes, and snippets.

@etscrivner
Last active May 25, 2024 23:29
Show Gist options
  • Save etscrivner/b4499524aaed5fccc1a9ce3635df14f0 to your computer and use it in GitHub Desktop.
Save etscrivner/b4499524aaed5fccc1a9ce3635df14f0 to your computer and use it in GitHub Desktop.
Simple math expression tokenizer and parser in C.
#include <assert.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdarg.h>
typedef enum eToken_Type eToken_Type;
enum eToken_Type {
TOKENTYPE_number,
TOKENTYPE_operator,
TOKENTYPE_lparen,
TOKENTYPE_rparen,
TOKENTYPE_COUNT
};
typedef enum eOperation eOperation;
enum eOperation {
OPERATION_add,
OPERATION_sub,
OPERATION_mul,
OPERATION_div,
OPERATION_COUNT
};
typedef struct Token Token;
struct Token {
eToken_Type Type;
union {
int Literal;
eOperation Operation;
};
};
typedef enum eNode_Type eNode_Type;
enum eNode_Type {
NODETYPE_literal,
NODETYPE_bin_op,
NODETYPE_COUNT
};
typedef struct Syntax_Node Syntax_Node;
struct Syntax_Node {
eNode_Type Type;
union {
int Literal;
struct {
eOperation Op;
Syntax_Node* Left;
Syntax_Node* Right;
} BinOp;
};
};
static const char* kNameFromOperation[OPERATION_COUNT] = {
[OPERATION_add] = "Add",
[OPERATION_sub] = "Subtract",
[OPERATION_mul] = "Multiply",
[OPERATION_div] = "Divide"
};
/*
=================
Tokenize
=================
*/
void Tokenize(const char* str, Token* out_tokens, int* out_tokens_count, int out_tokens_max) {
int token_count = 0;
const char* s = str;
Token* t;
#define PushToken(PTR, TYPE) { assert(token_count < out_tokens_max); (PTR) = &out_tokens[token_count++]; (PTR)->Type = (TYPE); }
while (*s) {
switch (*s) {
case '+': {
PushToken(t, TOKENTYPE_operator);
t->Operation = OPERATION_add;
++s;
} break;
case '-': {
PushToken(t, TOKENTYPE_operator);
t->Operation = OPERATION_sub;
++s;
} break;
case '*': {
PushToken(t, TOKENTYPE_operator);
t->Operation = OPERATION_mul;
++s;
} break;
case '/': {
PushToken(t, TOKENTYPE_operator);
t->Operation = OPERATION_div;
++s;
} break;
case '(': {
PushToken(t, TOKENTYPE_lparen);
++s;
} break;
case ')': {
PushToken(t, TOKENTYPE_rparen);
++s;
} break;
case ' ': /* fallthrough */
case '\t': /* fallthrough */
case '\n': /* fallthrough */
case '\r': {
++s;
} break;
default: {
if (isdigit(*s)) {
int value = 0;
while (*s && isdigit(*s)) {
value *= 10;
value += (*s - '0');
++s;
}
PushToken(t, TOKENTYPE_number);
t->Literal = value;
} else {
fprintf(stderr, "invalid character: [%c]\n", *s);
exit(1);
}
} break;
};
}
*out_tokens_count = token_count;
#undef PushToken
}
/*
=================
PrintTokens
=================
*/
void PrintTokens(Token* tokens, int token_count) {
for (int i = 0; i < token_count; ++i) {
switch (tokens[i].Type) {
case TOKENTYPE_operator: {
printf("OPERATOR(%s)\n", kNameFromOperation[tokens[i].Operation]);
} break;
case TOKENTYPE_lparen: {
printf("LPAREN\n");
} break;
case TOKENTYPE_rparen: {
printf("RPAREN\n");
} break;
case TOKENTYPE_number: {
printf("NUMBER(%d)\n", tokens[i].Literal);
} break;
default: break;
}
}
}
typedef struct Parser_State Parser_State;
struct Parser_State {
Token* Token;
Token* TokenOnePastLast;
Syntax_Node* NodePool;
int NodePoolIdx;
int NodePoolCount;
int ParenDepth;
bool HasError;
char ErrorMsg[64];
};
/*
=================
ParseError
=================
*/
void ParseError(Parser_State* p, const char* restrict fmt, ...) {
p->HasError = true;
va_list args;
va_start(args, fmt);
vsnprintf(p->ErrorMsg, 64, fmt, args);
va_end(args);
}
/*
=================
MaybeToken
=================
*/
bool MaybeToken(Parser_State*p, eToken_Type type) {
bool result = false;
if (!p->HasError && p->Token->Type == type) {
++p->Token;
result = true;
}
return(result);
}
/*
=================
PushNode
=================
*/
Syntax_Node* PushNode(Parser_State* state, eNode_Type type) {
assert(state->NodePoolIdx < state->NodePoolCount);
Syntax_Node* node = &state->NodePool[state->NodePoolIdx++];
node->Type = type;
return(node);
}
/*
=================
ParseExpression
=================
*/
Syntax_Node* ParseExpression(Parser_State* state) {
Syntax_Node* result = NULL;
if (!state->HasError) {
Token* prev_token = state->Token;
if (MaybeToken(state, TOKENTYPE_number)) {
Syntax_Node* lhs_num = PushNode(state, NODETYPE_literal);
lhs_num->Literal = prev_token->Literal;
Token* op_token = state->Token;
if (MaybeToken(state, TOKENTYPE_operator)) {
Syntax_Node* op = PushNode(state, NODETYPE_bin_op);
op->BinOp.Left = lhs_num;
op->BinOp.Op = op_token->Operation;
op->BinOp.Right = ParseExpression(state);
result = op;
} else if (MaybeToken(state, TOKENTYPE_rparen)) {
if (state->ParenDepth > 0) {
--state->ParenDepth;
result = lhs_num;
} else {
ParseError(state, "Extra close parenthesis\n");
}
} else {
result = lhs_num;
}
} else if (MaybeToken(state, TOKENTYPE_lparen)) {
int start_paren_depth = state->ParenDepth;
++state->ParenDepth;
result = ParseExpression(state);
if (state->ParenDepth == start_paren_depth) {
Token* op_token = state->Token;
if (MaybeToken(state, TOKENTYPE_operator)) {
Syntax_Node* op = PushNode(state, NODETYPE_bin_op);
op->BinOp.Left = result;
op->BinOp.Op = op_token->Operation;
op->BinOp.Right = ParseExpression(state);
result = op;
}
} else {
ParseError(state, "Unclosed parenthesis\n");
}
} else {
ParseError(state, "Unexpected token: [%d]\n", state->Token->Type);
}
}
return(result);
}
/*
=================
Parse
=================
*/
Syntax_Node* Parse(Token* tokens, int token_count, Syntax_Node* nodes, int node_max) {
Parser_State state = {
.Token = tokens,
.TokenOnePastLast = tokens + token_count,
.NodePool = nodes,
.NodePoolIdx = 0,
.NodePoolCount = node_max,
.ParenDepth = 0,
.HasError = false
};
Syntax_Node* root = ParseExpression(&state);
if (state.HasError) {
fprintf(stderr, "Parse Error: %s\n", state.ErrorMsg);
root = NULL;
}
return(root);
}
/*
=================
PrintSyntaxTree
=================
*/
void PrintSyntaxTree(Syntax_Node* root, int level) {
for (int i = 0; i < level; ++i) {
printf("|\t");
}
switch (root->Type) {
case NODETYPE_bin_op: {
printf("- BinOp(%s)\n", kNameFromOperation[root->BinOp.Op]);
PrintSyntaxTree(root->BinOp.Left, level + 1);
PrintSyntaxTree(root->BinOp.Right, level + 1);
} break;
case NODETYPE_literal: {
printf("- Literal(%d)\n", root->Literal);
} break;
default: break;
}
}
void main(void) {
#define MAX_TOKENS 64
#define MAX_NODES 64
Token tokens[MAX_TOKENS];
Syntax_Node nodes[MAX_NODES];
int token_count;
Tokenize("(3 * 5) / (4 + 6)", tokens, &token_count, MAX_TOKENS);
PrintTokens(tokens, token_count);
Syntax_Node* root = Parse(tokens, token_count, nodes, MAX_NODES);
if (root) {
PrintSyntaxTree(root, 0);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment