Last active
June 23, 2022 12:56
-
-
Save Silva97/3b967add0dd523e30567892fbb50428e to your computer and use it in GitHub Desktop.
Example of lexical analysis in C language.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Example by Luiz Felipe (Silva97)
 *
 * It's just an example. Don't consider it final code, and please DON'T
 * write all your code in a single module.
 *
 * Tip: Use a struct to manipulate translation units instead of using only
 * a FILE pointer ;)
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/** Kinds of token the lexer can produce. */
typedef enum
{
    TK_INTEGER = 0, /* integer literal: a run of decimal digits */
    TK_OPERATOR,    /* operator: `+` or `-` */
} token_type_t;
typedef struct token_s | |
{ | |
struct token_s *next; | |
struct token_s *prev; | |
char *translate_unit; | |
unsigned int line; | |
unsigned int column; | |
token_type_t type; | |
char *lexeme; | |
} token_t; | |
/* Print every token of the list to stdout. */
void show_tokens(token_t *tokens);
/* Tokenize the stream `tu`; `filename` is used in diagnostics and stored in each token. */
token_t *lexer(char *filename, FILE *tu);
/* Read the remaining digits of an integer literal that starts with `first_digit`. */
char *parse_literal_int(int first_digit, FILE *tu);
int main(int argc, char **argv) | |
{ | |
if (argc < 2) | |
{ | |
fputs("Usage: ./lextest <filename>", stderr); | |
return EXIT_FAILURE; | |
} | |
FILE *translate_unit = fopen(argv[1], "r"); | |
if (!translate_unit) | |
{ | |
perror("Unable to open file."); | |
return EXIT_FAILURE; | |
} | |
token_t *tokens = lexer(argv[1], translate_unit); | |
if (!tokens) | |
{ | |
return EXIT_FAILURE; | |
} | |
show_tokens(tokens); | |
} | |
const char *tk_typename[] = { | |
[TK_INTEGER] = "integer", | |
[TK_OPERATOR] = "operator", | |
}; | |
/**
 * Print each token of a list to stdout, one per line, showing its lexeme
 * and the name of its type.
 *
 * @param tokens  First node of the list; NULL prints nothing.
 */
void show_tokens(token_t *tokens)
{
    for (token_t *current = tokens; current; current = current->next)
    {
        printf("- lexeme: `%s' | type: %s\n", current->lexeme, tk_typename[current->type]);
    }
}
// ----------------------- | |
/*
 * Assign the pending token's type and lexeme in one step.
 *
 * Relies on variables named `type` and `lexeme` being in scope at the call
 * site. Wrapped in do { } while (0) so that `SET_TOKEN(a, b);` is exactly one
 * statement and stays correct inside an unbraced if/else.
 */
#define SET_TOKEN(type_value, lexeme_name) \
    do \
    { \
        type = (type_value); \
        lexeme = (lexeme_name); \
    } while (0)
token_t *lexer(char *filename, FILE *tu) | |
{ | |
token_type_t type; | |
int input; | |
unsigned int line = 1; | |
unsigned int column = 1; | |
char *lexeme; | |
token_t *current_token = NULL; | |
token_t *first_token = NULL; | |
while ((input = fgetc(tu)) != EOF) | |
{ | |
while (isspace(input)) | |
{ | |
// CAUTION here: The line break is really \n? It's not true on all text formats. | |
if (input == '\n') | |
{ | |
line++; | |
column = 1; | |
} | |
else | |
{ | |
column++; | |
} | |
input = fgetc(tu); | |
} | |
switch (input) | |
{ | |
case EOF: | |
return first_token; | |
case '+': | |
SET_TOKEN(TK_OPERATOR, "+"); | |
break; | |
case '-': | |
SET_TOKEN(TK_OPERATOR, "-"); | |
break; | |
case '0' ... '9': // range with ... is a GCC extension | |
SET_TOKEN(TK_INTEGER, parse_literal_int(input, tu)); | |
break; | |
default: | |
fprintf(stderr, "Syntactic error: Character '%c' on %s:%d:%d is invalid.\n", input, filename, line, column); | |
// CAUTION here: Memory leak because we doesn't free the allocated tokens. | |
return NULL; | |
break; | |
} | |
token_t *new_token = malloc(sizeof *new_token); | |
new_token->type = type; | |
new_token->lexeme = lexeme; | |
new_token->translate_unit = filename; | |
new_token->line = line; | |
new_token->column = column; | |
new_token->next = NULL; | |
new_token->prev = current_token; | |
if (!first_token) | |
{ | |
first_token = new_token; | |
} | |
else | |
{ | |
current_token->next = new_token; | |
} | |
current_token = new_token; | |
} | |
return first_token; | |
} | |
/* Maximum number of digits accepted in one integer literal. */
#define MAX_DIGITS 16

/**
 * Read an integer literal from a stream and return its text.
 *
 * The caller has already consumed the first digit; this function consumes
 * the digits that follow it. The first non-digit character read is pushed
 * back onto the stream so the caller sees it next.
 *
 * @param first_digit  The digit character already read from `tu`.
 * @param tu           Stream positioned right after `first_digit`.
 * @return A heap-allocated, NUL-terminated string of at most MAX_DIGITS
 *         digits; the caller must free() it. Never returns NULL: the process
 *         exits with EXIT_FAILURE if the literal is longer than MAX_DIGITS
 *         or if memory cannot be allocated.
 */
char *parse_literal_int(int first_digit, FILE *tu)
{
    /* MAX_DIGITS digits plus the terminating NUL. */
    char *digits = malloc(MAX_DIGITS + 1);
    if (!digits)
    {
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    size_t length = 0;
    digits[length++] = (char)first_digit;

    int input;
    while ((input = fgetc(tu)) != EOF && isdigit(input))
    {
        /* Reject the digit that would leave no room for the terminator. */
        if (length >= MAX_DIGITS)
        {
            fputs("Literal number exceeded maximum size.\n", stderr);
            exit(EXIT_FAILURE);
        }
        digits[length++] = (char)input;
    }

    if (input != EOF)
    {
        ungetc(input, tu);
    }

    digits[length] = '\0';
    return digits;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment