Last active
September 3, 2024 17:58
-
-
Save seanjensengrey/874a1dcdb7b40407ac916dd2090051a4 to your computer and use it in GitHub Desktop.
Marc Feeley Tiny C compiler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* file: "tinyc.c" */ | |
/* originally from http://www.iro.umontreal.ca/~felipe/IFT2030-Automne2002/Complements/tinyc.c */ | |
/* Copyright (C) 2001 by Marc Feeley, All Rights Reserved. */ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/* | |
* This is a compiler for the Tiny-C language. Tiny-C is a | |
* considerably stripped down version of C and it is meant as a | |
* pedagogical tool for learning about compilers. The integer global | |
* variables "a" to "z" are predefined and initialized to zero, and it | |
* is not possible to declare new variables. The compiler reads the | |
* program from standard input and prints out the value of the | |
* variables that are not zero. The grammar of Tiny-C in EBNF is: | |
* | |
* <program> ::= <statement> | |
* <statement> ::= "if" <paren_expr> <statement> | | |
* "if" <paren_expr> <statement> "else" <statement> | | |
* "while" <paren_expr> <statement> | | |
* "do" <statement> "while" <paren_expr> ";" | | |
* "{" { <statement> } "}" | | |
* <expr> ";" | | |
* ";" | |
* <paren_expr> ::= "(" <expr> ")" | |
* <expr> ::= <test> | <id> "=" <expr> | |
* <test> ::= <sum> | <sum> "<" <sum> | |
* <sum> ::= <term> | <sum> "+" <term> | <sum> "-" <term> | |
* <term> ::= <id> | <int> | <paren_expr> | |
* <id> ::= "a" | "b" | "c" | "d" | ... | "z" | |
* <int> ::= <an_unsigned_decimal_integer> | |
* | |
* Here are a few invocations of the compiler: | |
* | |
* % echo "a=b=c=2<3;" | ./a.out | |
* a = 1 | |
* b = 1 | |
* c = 1 | |
* % echo "{ i=1; while (i<100) i=i+i; }" | ./a.out | |
* i = 128 | |
* % echo "{ i=125; j=100; while (i-j) if (i<j) j=j-i; else i=i-j; }" | ./a.out | |
* i = 25 | |
* j = 25 | |
* % echo "{ i=1; do i=i+10; while (i<50); }" | ./a.out | |
* i = 51 | |
* % echo "{ i=1; while ((i=i+10)<50) ; }" | ./a.out | |
* i = 51 | |
* % echo "{ i=7; if (i<5) x=1; if (i<10) y=2; }" | ./a.out | |
* i = 7 | |
* y = 2 | |
* | |
* The compiler does a minimal amount of error checking to help | |
* highlight the structure of the compiler. | |
*/ | |
/*---------------------------------------------------------------------------*/ | |
/* Lexer. */ | |
enum { DO_SYM, ELSE_SYM, IF_SYM, WHILE_SYM, LBRA, RBRA, LPAR, RPAR, | |
PLUS, MINUS, LESS, SEMI, EQUAL, INT, ID, EOI }; | |
char *words[] = { "do", "else", "if", "while", NULL }; | |
int ch = ' '; | |
int sym; | |
int int_val; | |
char id_name[100]; | |
void syntax_error() { fprintf(stderr, "syntax error\n"); exit(1); } | |
void next_ch() { ch = getchar(); } | |
void next_sym() | |
{ again: switch (ch) | |
{ case ' ': case '\n': next_ch(); goto again; | |
case EOF: sym = EOI; break; | |
case '{': next_ch(); sym = LBRA; break; | |
case '}': next_ch(); sym = RBRA; break; | |
case '(': next_ch(); sym = LPAR; break; | |
case ')': next_ch(); sym = RPAR; break; | |
case '+': next_ch(); sym = PLUS; break; | |
case '-': next_ch(); sym = MINUS; break; | |
case '<': next_ch(); sym = LESS; break; | |
case ';': next_ch(); sym = SEMI; break; | |
case '=': next_ch(); sym = EQUAL; break; | |
default: | |
if (ch >= '0' && ch <= '9') | |
{ int_val = 0; /* missing overflow check */ | |
while (ch >= '0' && ch <= '9') | |
{ int_val = int_val*10 + (ch - '0'); next_ch(); } | |
sym = INT; | |
} | |
else if (ch >= 'a' && ch <= 'z') | |
{ int i = 0; /* missing overflow check */ | |
while ((ch >= 'a' && ch <= 'z') || ch == '_') | |
{ id_name[i++] = ch; next_ch(); } | |
id_name[i] = '\0'; | |
sym = 0; | |
while (words[sym] != NULL && strcmp(words[sym], id_name) != 0) | |
sym++; | |
if (words[sym] == NULL) | |
if (id_name[1] == '\0') sym = ID; else syntax_error(); | |
} | |
else | |
syntax_error(); | |
} | |
} | |
/*---------------------------------------------------------------------------*/ | |
/* Parser. */ | |
enum { VAR, CST, ADD, SUB, LT, SET, | |
IF1, IF2, WHILE, DO, EMPTY, SEQ, EXPR, PROG }; | |
struct node { int kind; struct node *o1, *o2, *o3; int val; }; | |
typedef struct node node; | |
node *new_node(int k) | |
{ node *x = (node*)malloc(sizeof(node)); x->kind = k; return x; } | |
node *paren_expr(); /* forward declaration */ | |
node *term() /* <term> ::= <id> | <int> | <paren_expr> */ | |
{ node *x; | |
if (sym == ID) { x=new_node(VAR); x->val=id_name[0]-'a'; next_sym(); } | |
else if (sym == INT) { x=new_node(CST); x->val=int_val; next_sym(); } | |
else x = paren_expr(); | |
return x; | |
} | |
node *sum() /* <sum> ::= <term> | <sum> "+" <term> | <sum> "-" <term> */ | |
{ node *t, *x = term(); | |
while (sym == PLUS || sym == MINUS) | |
{ t=x; x=new_node(sym==PLUS?ADD:SUB); next_sym(); x->o1=t; x->o2=term(); } | |
return x; | |
} | |
node *test() /* <test> ::= <sum> | <sum> "<" <sum> */ | |
{ node *t, *x = sum(); | |
if (sym == LESS) | |
{ t=x; x=new_node(LT); next_sym(); x->o1=t; x->o2=sum(); } | |
return x; | |
} | |
node *expr() /* <expr> ::= <test> | <id> "=" <expr> */ | |
{ node *t, *x; | |
if (sym != ID) return test(); | |
x = test(); | |
if (x->kind == VAR && sym == EQUAL) | |
{ t=x; x=new_node(SET); next_sym(); x->o1=t; x->o2=expr(); } | |
return x; | |
} | |
node *paren_expr() /* <paren_expr> ::= "(" <expr> ")" */ | |
{ node *x; | |
if (sym == LPAR) next_sym(); else syntax_error(); | |
x = expr(); | |
if (sym == RPAR) next_sym(); else syntax_error(); | |
return x; | |
} | |
node *statement() | |
{ node *t, *x; | |
if (sym == IF_SYM) /* "if" <paren_expr> <statement> */ | |
{ x = new_node(IF1); | |
next_sym(); | |
x->o1 = paren_expr(); | |
x->o2 = statement(); | |
if (sym == ELSE_SYM) /* ... "else" <statement> */ | |
{ x->kind = IF2; | |
next_sym(); | |
x->o3 = statement(); | |
} | |
} | |
else if (sym == WHILE_SYM) /* "while" <paren_expr> <statement> */ | |
{ x = new_node(WHILE); | |
next_sym(); | |
x->o1 = paren_expr(); | |
x->o2 = statement(); | |
} | |
else if (sym == DO_SYM) /* "do" <statement> "while" <paren_expr> ";" */ | |
{ x = new_node(DO); | |
next_sym(); | |
x->o1 = statement(); | |
if (sym == WHILE_SYM) next_sym(); else syntax_error(); | |
x->o2 = paren_expr(); | |
if (sym == SEMI) next_sym(); else syntax_error(); | |
} | |
else if (sym == SEMI) /* ";" */ | |
{ x = new_node(EMPTY); next_sym(); } | |
else if (sym == LBRA) /* "{" { <statement> } "}" */ | |
{ x = new_node(EMPTY); | |
next_sym(); | |
while (sym != RBRA) | |
{ t=x; x=new_node(SEQ); x->o1=t; x->o2=statement(); } | |
next_sym(); | |
} | |
else /* <expr> ";" */ | |
{ x = new_node(EXPR); | |
x->o1 = expr(); | |
if (sym == SEMI) next_sym(); else syntax_error(); | |
} | |
return x; | |
} | |
node *program() /* <program> ::= <statement> */ | |
{ node *x = new_node(PROG); | |
next_sym(); x->o1 = statement(); if (sym != EOI) syntax_error(); | |
return x; | |
} | |
/*---------------------------------------------------------------------------*/ | |
/* Code generator. */ | |
enum { IFETCH, ISTORE, IPUSH, IPOP, IADD, ISUB, ILT, JZ, JNZ, JMP, HALT }; | |
typedef char code; | |
code object[1000], *here = object; | |
void g(code c) { *here++ = c; } /* missing overflow check */ | |
code *hole() { return here++; } | |
void fix(code *src, code *dst) { *src = dst-src; } /* missing overflow check */ | |
void c(node *x) | |
{ code *p1, *p2; | |
switch (x->kind) | |
{ case VAR : g(IFETCH); g(x->val); break; | |
case CST : g(IPUSH); g(x->val); break; | |
case ADD : c(x->o1); c(x->o2); g(IADD); break; | |
case SUB : c(x->o1); c(x->o2); g(ISUB); break; | |
case LT : c(x->o1); c(x->o2); g(ILT); break; | |
case SET : c(x->o2); g(ISTORE); g(x->o1->val); break; | |
case IF1 : c(x->o1); g(JZ); p1=hole(); c(x->o2); fix(p1,here); break; | |
case IF2 : c(x->o1); g(JZ); p1=hole(); c(x->o2); g(JMP); p2=hole(); | |
fix(p1,here); c(x->o3); fix(p2,here); break; | |
case WHILE: p1=here; c(x->o1); g(JZ); p2=hole(); c(x->o2); | |
g(JMP); fix(hole(),p1); fix(p2,here); break; | |
case DO : p1=here; c(x->o1); c(x->o2); g(JNZ); fix(hole(),p1); break; | |
case EMPTY: break; | |
case SEQ : c(x->o1); c(x->o2); break; | |
case EXPR : c(x->o1); g(IPOP); break; | |
case PROG : c(x->o1); g(HALT); break; | |
} | |
} | |
/*---------------------------------------------------------------------------*/ | |
/* Virtual machine. */ | |
int globals[26]; | |
void run() | |
{ int stack[1000], *sp = stack; | |
code *pc = object; | |
again: switch (*pc++) | |
{ case IFETCH: *sp++ = globals[*pc++]; goto again; | |
case ISTORE: globals[*pc++] = sp[-1]; goto again; | |
case IPUSH : *sp++ = *pc++; goto again; | |
case IPOP : --sp; goto again; | |
case IADD : sp[-2] = sp[-2] + sp[-1]; --sp; goto again; | |
case ISUB : sp[-2] = sp[-2] - sp[-1]; --sp; goto again; | |
case ILT : sp[-2] = sp[-2] < sp[-1]; --sp; goto again; | |
case JMP : pc += *pc; goto again; | |
case JZ : if (*--sp == 0) pc += *pc; else pc++; goto again; | |
case JNZ : if (*--sp != 0) pc += *pc; else pc++; goto again; | |
} | |
} | |
/*---------------------------------------------------------------------------*/ | |
/* Main program. */ | |
int main() | |
{ int i; | |
c(program()); | |
for (i=0; i<26; i++) | |
globals[i] = 0; | |
run(); | |
for (i=0; i<26; i++) | |
if (globals[i] != 0) | |
printf("%c = %d\n", 'a'+i, globals[i]); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you
I'm implementing TinyC for a class project! It's proven to be a good place to start iterating from. I want to make it more expressive, and I was wondering: what extensions do you recommend I build on top of it? I was thinking of functions and user-defined variables, for a start. Probably input handling of some sort, as well.
Observations
I found a bug in the spec, I think. Take a look at this:
When you have
"{" { <statement> } "}"
, you seem to be only capturing one statement per scope. If that's right, I think it should be something like"{" { <statement> }+ "}"
or"{" { <statement> }* "}"
instead :)