Silva97 · June 23, 2022 12:56
diff --git a/lexer-example.c b/lexer-example.c
 /**
 * Example by Luiz Felipe (Silva97)
 *
 * This is just an example. Don't treat it as final code, and please DON'T
 * write all of your code in a single module.
 *
 * Tip: Use a struct to manage translation units instead of using only a
 *      FILE pointer ;)
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>

 /**
 * Kinds of token the lexer can produce.
 *
 * The values are used as indices into tk_typename, so they stay contiguous
 * and start at zero.
 */
 typedef enum
 {
  TK_INTEGER,      // integer literal (lexeme built by parse_literal_int)
  TK_OPERATOR = 1, // "+" or "-"
 } token_type_t;

 /**
 * One lexical token, stored as a node of a doubly linked list.
 *
 * lexer() creates the nodes in input order and returns the first one.
 */
 typedef struct token_s
 {
  // Following token in the list, or NULL for the last token.
  struct token_s *next;
  // Preceding token in the list, or NULL for the first token.
  struct token_s *prev;
  // File name given to lexer(); the pointer is stored as-is, not copied.
  char *translate_unit;
  // Source line that lexer() recorded for this token (counted from 1).
  unsigned int line;
  // Source column that lexer() recorded for this token (counted from 1).
  unsigned int column;
  // Category of the token.
  token_type_t type;
  // Token text: a string literal for operators, a malloc'd buffer returned
  // by parse_literal_int() for integers.
  char *lexeme;
 } token_t;

 // Forward declarations; the definitions follow main() in this file.

 // Print every token of the list to stdout, one line per token.
 void show_tokens(token_t *tokens);
 // Tokenize the stream `tu`; `filename` is stored in each token and used in error messages.
 token_t *lexer(char *filename, FILE *tu);
 // Read the remaining digits of an integer literal whose first digit was already consumed.
 char *parse_literal_int(int first_digit, FILE *tu);

 int main(int argc, char **argv)
 {
  if (argc < 2)
  {
    fputs("Usage: ./lextest <filename>", stderr);
    return EXIT_FAILURE;
  }

  FILE *translate_unit = fopen(argv[1], "r");
  if (!translate_unit)
  {
    perror("Unable to open file.");
    return EXIT_FAILURE;
  }

  token_t *tokens = lexer(argv[1], translate_unit);
  if (!tokens)
  {
    return EXIT_FAILURE;
  }

  show_tokens(tokens);
 }

 /**
 * Printable name of each token type, indexed by its token_type_t value.
 */
 const char *tk_typename[] = {
    [TK_OPERATOR] = "operator",
    [TK_INTEGER] = "integer",
 };

 /**
 * Print every token of a list to stdout, one line per token.
 *
 * tokens: first node of the list; a NULL list prints nothing.
 */
 void show_tokens(token_t *tokens)
 {
  for (token_t *node = tokens; node != NULL; node = node->next)
  {
    const char *kind = tk_typename[node->type];

    printf("- lexeme: `%s' | type: %s\n", node->lexeme, kind);
  }
 }

 // -----------------------

 /**
 * Assign the `type` and `lexeme` variables of the enclosing scope.
 *
 * Wrapped in do { } while (0) so the two assignments behave as a single
 * statement (e.g. under an unbraced `if`); the arguments are parenthesized
 * so that any expression can be passed. Invoke it followed by a semicolon.
 */
 #define SET_TOKEN(type_value, lexeme_name) \
  do                                       \
  {                                        \
    type = (type_value);                   \
    lexeme = (lexeme_name);                \
  } while (0)

 /**
 * Free every token of a (possibly partial) list.
 *
 * Only TK_INTEGER lexemes are freed: they are allocated by
 * parse_literal_int(), whereas operator lexemes are string literals.
 */
 static void lexer_discard_tokens(token_t *first)
 {
  while (first)
  {
    token_t *next = first->next;

    if (first->type == TK_INTEGER)
    {
      free(first->lexeme);
    }

    free(first);
    first = next;
  }
 }

 /**
 * Split the stream `tu` into a doubly linked list of tokens.
 *
 * filename: name reported in diagnostics and stored (not copied) in every
 *           token's translate_unit field.
 * tu:       stream opened for reading; it is consumed up to EOF or up to the
 *           first invalid character. It is not closed here.
 *
 * Returns the first token of the list, or NULL when the input holds no
 * tokens, when an invalid character is found, or when a token cannot be
 * allocated. In the two failure cases a message is written to stderr and
 * every token built so far is released.
 */
 token_t *lexer(char *filename, FILE *tu)
 {
  token_type_t type;
  int input;
  unsigned int line = 1;
  unsigned int column = 1;
  char *lexeme;
  token_t *current_token = NULL;
  token_t *first_token = NULL;

  while ((input = fgetc(tu)) != EOF)
  {
    while (isspace(input))
    {
      // CAUTION here: The line break is really \n? It's not true on all text formats.
      if (input == '\n')
      {
        line++;
        column = 1;
      }
      else
      {
        column++;
      }

      input = fgetc(tu);
    }

    switch (input)
    {
    case EOF:
      // Trailing whitespace ran into the end of the input.
      return first_token;
    case '+':
      SET_TOKEN(TK_OPERATOR, "+");
      break;
    case '-':
      SET_TOKEN(TK_OPERATOR, "-");
      break;
    // Explicit labels instead of the GCC-only `'0' ... '9'` case range.
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      SET_TOKEN(TK_INTEGER, parse_literal_int(input, tu));
      break;
    default:
      // line and column are unsigned, hence %u.
      fprintf(stderr, "Syntactic error: Character '%c' on %s:%u:%u is invalid.\n", input, filename, line, column);
      lexer_discard_tokens(first_token);
      return NULL;
    }

    token_t *new_token = malloc(sizeof *new_token);
    if (!new_token)
    {
      fputs("Unable to allocate memory for a token.\n", stderr);

      // The pending integer lexeme is not linked into the list yet.
      if (type == TK_INTEGER)
      {
        free(lexeme);
      }

      lexer_discard_tokens(first_token);
      return NULL;
    }

    new_token->type = type;
    new_token->lexeme = lexeme;
    new_token->translate_unit = filename;
    new_token->line = line;
    new_token->column = column; // column of the token's first character
    new_token->next = NULL;
    new_token->prev = current_token;

    if (!first_token)
    {
      first_token = new_token;
    }
    else
    {
      current_token->next = new_token;
    }

    current_token = new_token;

    // Step over the characters of the token just read so that the following
    // tokens and diagnostics report their real column.
    for (const char *cursor = lexeme; *cursor != '\0'; cursor++)
    {
      column++;
    }
  }

  return first_token;
 }

 #define MAX_DIGITS 16

 /**
 * Read the rest of a decimal integer literal from `tu`.
 *
 * first_digit: the literal's first digit, already consumed by the caller.
 * tu:          stream positioned right after that digit.
 *
 * Returns a malloc'd, NUL-terminated string of at most MAX_DIGITS digits;
 * the caller must free() it. The first non-digit character read is pushed
 * back onto the stream. The process exits with EXIT_FAILURE when the literal
 * has more than MAX_DIGITS digits or when the buffer cannot be allocated.
 */
 char *parse_literal_int(int first_digit, FILE *tu)
 {
  char *digits = malloc(MAX_DIGITS + 1);
  if (!digits)
  {
    fputs("Unable to allocate memory for a literal number.\n", stderr);
    exit(EXIT_FAILURE);
  }

  size_t length = 0;
  int input;

  digits[length++] = (char)first_digit;

  while (isdigit(input = fgetc(tu)))
  {
    // The buffer holds MAX_DIGITS digits plus the terminator, so reject the
    // digit that would leave no room for the final '\0'.
    if (length >= MAX_DIGITS)
    {
      fputs("Literal number exceeded maximum size.\n", stderr);
      exit(EXIT_FAILURE);
    }

    digits[length++] = (char)input;
  }

  // Give the terminating character back so the caller can lex it.
  if (input != EOF)
  {
    ungetc(input, tu);
  }

  digits[length] = '\0';
  return digits;
 }
	/**
	* Example by Luiz Felipe (Silva97)
	*
	* This is just an example. Don't treat it as final code, and please DON'T
	* write all of your code in a single module.
	*
	* Tip: Use a struct to manage translation units instead of using only a
	* FILE pointer ;)
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <ctype.h>

	/**
	* Kinds of token the lexer can produce.
	*
	* The values are used as indices into tk_typename, so they stay contiguous
	* and start at zero.
	*/
	typedef enum
	{
	  TK_INTEGER,      // integer literal (lexeme built by parse_literal_int)
	  TK_OPERATOR = 1, // "+" or "-"
	} token_type_t;

	/**
	* One lexical token, stored as a node of a doubly linked list.
	*
	* The lexer links the nodes through next/prev in input order.
	*/
	typedef struct token_s
	{
	// Following token in the list, or NULL for the last token.
	struct token_s *next;
	// Preceding token in the list, or NULL for the first token.
	struct token_s *prev;
	// File name given to the lexer; the pointer is stored as-is, not copied.
	char *translate_unit;
	// Source line the lexer recorded for this token (counted from 1).
	unsigned int line;
	// Source column the lexer recorded for this token (counted from 1).
	unsigned int column;
	// Category of the token.
	token_type_t type;
	// Token text: a string literal for operators, a malloc'd buffer returned
	// by parse_literal_int() for integers.
	char *lexeme;
	} token_t;

	// Forward declarations; the definitions follow main() in this file.

	// Print every token of the list to stdout, one line per token.
	void show_tokens(token_t *tokens);
	// Tokenize the stream `tu`; `filename` is stored in each token and used in
	// error messages. Returns a pointer to the first token (main() stores the
	// result in a `token_t *` and passes argv[1], a `char *`, as filename).
	token_t *lexer(char *filename, FILE *tu);
	// Read the remaining digits of an integer literal whose first digit was
	// already consumed; returns the literal as a string. FILE objects are
	// always handled through pointers.
	char *parse_literal_int(int first_digit, FILE *tu);

	int main(int argc, char **argv)
	{
	if (argc < 2)
	{
	fputs("Usage: ./lextest <filename>", stderr);
	return EXIT_FAILURE;
	}

	FILE *translate_unit = fopen(argv[1], "r");
	if (!translate_unit)
	{
	perror("Unable to open file.");
	return EXIT_FAILURE;
	}

	token_t *tokens = lexer(argv[1], translate_unit);
	if (!tokens)
	{
	return EXIT_FAILURE;
	}

	show_tokens(tokens);
	}

	/**
	* Printable name of each token type, indexed by its token_type_t value.
	*/
	const char *tk_typename[] = {
	  [TK_OPERATOR] = "operator",
	  [TK_INTEGER] = "integer",
	};

	/**
	* Print every token of a list to stdout, one line per token.
	*
	* tokens: first node of the list; a NULL list prints nothing.
	*/
	void show_tokens(token_t *tokens)
	{
	  token_t *current = tokens;

	  while (current)
	  {
	    // A plain '|' separates the two fields; "\|" is not a valid C escape sequence.
	    printf("- lexeme: `%s' | type: %s\n", current->lexeme, tk_typename[current->type]);
	    current = current->next;
	  }
	}

	// -----------------------

	/**
	* Assign the `type` and `lexeme` variables of the enclosing scope.
	*
	* Wrapped in do { } while (0) so the two assignments behave as a single
	* statement (e.g. under an unbraced `if`); the arguments are parenthesized
	* so that any expression can be passed. Invoke it followed by a semicolon.
	*/
	#define SET_TOKEN(type_value, lexeme_name) \
	  do \
	  { \
	    type = (type_value); \
	    lexeme = (lexeme_name); \
	  } while (0)

	token_t lexer(char filename, FILE *tu)
	{
	token_type_t type;
	int input;
	unsigned int line = 1;
	unsigned int column = 1;
	char *lexeme;
	token_t *current_token = NULL;
	token_t *first_token = NULL;

	while ((input = fgetc(tu)) != EOF)
	{
	while (isspace(input))
	{
	// CAUTION here: The line break is really \n? It's not true on all text formats.
	if (input == '\n')
	{
	line++;
	column = 1;
	}
	else
	{
	column++;
	}

	input = fgetc(tu);
	}

	switch (input)
	{
	case EOF:
	return first_token;
	case '+':
	SET_TOKEN(TK_OPERATOR, "+");
	break;
	case '-':
	SET_TOKEN(TK_OPERATOR, "-");
	break;
	case '0' ... '9': // range with ... is a GCC extension
	SET_TOKEN(TK_INTEGER, parse_literal_int(input, tu));
	break;
	default:
	fprintf(stderr, "Syntactic error: Character '%c' on %s:%d:%d is invalid.\n", input, filename, line, column);
	// CAUTION here: Memory leak because we doesn't free the allocated tokens.
	return NULL;
	break;
	}

	token_t new_token = malloc(sizeof new_token);
	new_token->type = type;
	new_token->lexeme = lexeme;
	new_token->translate_unit = filename;
	new_token->line = line;
	new_token->column = column;
	new_token->next = NULL;
	new_token->prev = current_token;

	if (!first_token)
	{
	first_token = new_token;
	}
	else
	{
	current_token->next = new_token;
	}

	current_token = new_token;
	}

	return first_token;
	}

	#define MAX_DIGITS 16

	/**
	* Read the rest of a decimal integer literal from `tu`.
	*
	* first_digit: the literal's first digit, already consumed by the caller.
	* tu:          stream positioned right after that digit.
	*
	* Returns a malloc'd, NUL-terminated string of at most MAX_DIGITS digits;
	* the caller must free() it. The first non-digit character read is pushed
	* back onto the stream. The process exits with EXIT_FAILURE when the literal
	* has more than MAX_DIGITS digits or when the buffer cannot be allocated.
	*/
	char *parse_literal_int(int first_digit, FILE *tu)
	{
	  char *digits = malloc(MAX_DIGITS + 1);
	  if (!digits)
	  {
	    fputs("Unable to allocate memory for a literal number.\n", stderr);
	    exit(EXIT_FAILURE);
	  }

	  size_t length = 0;
	  int input;

	  digits[length++] = (char)first_digit;

	  while (isdigit(input = fgetc(tu)))
	  {
	    // The buffer holds MAX_DIGITS digits plus the terminator, so reject the
	    // digit that would leave no room for the final '\0'.
	    if (length >= MAX_DIGITS)
	    {
	      fputs("Literal number exceeded maximum size.\n", stderr);
	      exit(EXIT_FAILURE);
	    }

	    digits[length++] = (char)input;
	  }

	  // Give the terminating character back so the caller can lex it.
	  if (input != EOF)
	  {
	    ungetc(input, tu);
	  }

	  digits[length] = '\0';
	  return digits;
	}