Created
August 24, 2025 22:07
-
-
Save Chubek/da5524d9c39b17f9e7114c8a9d3b1614 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <ctype.h> | |
| #include <stdbool.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <uchar.h> | |
/* Parameters of the 32-bit FNV-1a hash: the multiplier (prime) and the
   initial accumulator value (offset basis).  */
#define FNV_32_PRIME 0x1000193U
#define FNV_32_OFFSET_BASIS 0x811c9dc5U

/* Character under the scan cursor and the one immediately after it.
   Neither macro checks bounds; the user must know that INPTR (resp.
   INPTR + 1) is a readable index of INPUT_STREAM.  */
#define TOP_CHAR (INPUT_STREAM[INPTR])
#define PEEK_CHAR (INPUT_STREAM[INPTR + 1])

/* Lower-cased look-ahead character.  The value is converted to
   unsigned char first: handing a negative plain char to tolower() is
   undefined behaviour.  */
#define PEEK_LOWER tolower ((unsigned char) PEEK_CHAR)

/* Number of Token_t slots allocated for the token stream (16384).  */
#define NUM_TOKEN_SLOTS (1 << 14)
| #define Token \ | |
| struct \ | |
| { \ | |
| int toktype; \ | |
| const Value_t *lexeme; \ | |
| } | |
| #define Symbol \ | |
| struct \ | |
| { \ | |
| const char *key; \ | |
| const Value_t value; \ | |
| bool reserved; \ | |
| } | |
| #define Value \ | |
| struct \ | |
| { \ | |
| enum \ | |
| { \ | |
| VAL_String, \ | |
| VAL_Integer, \ | |
| VAL_Rational, \ | |
| VAL_Char, \ | |
| VAL_Id, \ | |
| } type; \ | |
| union \ | |
| { \ | |
| intmax_t as_integer; \ | |
| double as_real; \ | |
| const char32_t *as_string; \ | |
| char32_t as_char; \ | |
| const char *as_ident; \ | |
| Keyword_t as_kw; \ | |
| }; \ | |
| } | |
| typedef Value Value_t; | |
| typedef Token Token_t; | |
| typedef Symbol Symbol_t; | |
| static Token_t *TOKEN_STREAM = NULL; | |
| static Symbol_t *SYMBOL_TABLE = NULL; | |
| static const char *INPUT_STREAM = NULL; | |
| static unsigned INLEN = 0; | |
| static unsigned SYMSZ = 0; | |
| static unsigned TSPTR = 0; | |
| static unsigned INPTR = 0; | |
| static inline uint32_t | |
| fnv1a_hash32 (const char *sym) | |
| { | |
| uint32_t h = FNV_32_OFFSET_BASIS; | |
| char c = 0; | |
| while ((c = *sym++)) | |
| { | |
| h ^= c; | |
| h *= FNV_32_PRIME; | |
| } | |
| return h % SYMSZE; | |
| } | |
| static inline void | |
| insert_str_token (const char32_t *lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_String; | |
| TOKEN_STREAM[TSPTR].lexeme.as_string = lexeme; | |
| TSPTR++; | |
| } | |
| static inline void | |
| insert_int_token (intmax_t lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_Integer; | |
| TOKEN_STREAM[TSPTR].lexeme.as_integer = lexeme; | |
| TSPTR++; | |
| } | |
| static inline void | |
| insert_char_token (char32_t lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_Char; | |
| TOKEN_STREAM[TSPTR].lexeme.as_char = lexeme; | |
| TSPTR++; | |
| } | |
| static inline void | |
| insert_real_token (double lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_Real; | |
| TOKEN_STREAM[TSPTR].lexeme.as_real = lexeme; | |
| TSPTR++; | |
| } | |
| static inline void | |
| insert_ident_token (const char *lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_Ident; | |
| TOKEN_STREAM[TSPTR].lexeme.as_ident = lexeme; | |
| TSPTR++; | |
| } | |
| static inline void | |
| insert_kw_token (Keyword_t lexeme) | |
| { | |
| TOKEN_STREAM[TSPTR].toktype = TOK_Keyword; | |
| TOKEN_STREAM[TSPTR].lexeme.as_kw = lexeme; | |
| TSPTR++; | |
| } | |
| void | |
| cleanup_scanner (void) | |
| { | |
| free (TOKEN_STREAM); | |
| free (INPUT_STREAM); | |
| free (SYMBOL_TABLE); | |
| } | |
| void | |
| scan_program (void) | |
| { | |
| atexit (cleanup_scanner); | |
| TOKEN_STREAM = calloc (NUM_TOKEN_SLOTS, sizeof (Token_t)); | |
| init_symtbl (&SYMBOL_TABLE, &SYMSZ); | |
| handle_input (&INPUT_STREAM, &INLEN); | |
| TSPTR = 0; | |
| INPTR = 0; | |
| while (INPTR < INLEN && TOP_CHAR) | |
| { | |
| while (isblank (TOP_CHAR)) | |
| INPTR++; | |
| if (isalpha (TOP_CHAR) || TOP_CHAR == '_') | |
| { | |
| const char *lexeme = lex_ident (); | |
| uint32_t h = fnv1a_hash32 (lexeme); | |
| if (SYMBOL_TABLE[h].reserved) | |
| insert_kw_token (SYMBOL_TABLE[h].value.as_kw); | |
| else if (SYMBOL_TABLE[h].key != NULL) | |
| { | |
| insert_ident_token (SYMBOL_TABLE[h].key); | |
| free (lexeme); | |
| } | |
| else | |
| { | |
| SYMBOL_TABLE[h].key = lexeme; | |
| SYMBOL_TABLE[h].value.as_ident = NULL; | |
| SYMBOL_TABLE[h].reserved = false; | |
| insert_ident_token (lexeme); | |
| } | |
| } | |
| else if (isdigit (TOP_CHAR)) | |
| { | |
| if (PEEK_LOWER == 'x') | |
| { | |
| INPTR += 2; | |
| intmax_t lexeme = lex_hex (); | |
| insert_int_token (lexeme); | |
| } | |
| else if (PEEK_LOWER == 'o') | |
| { | |
| INPTR += 2; | |
| intmax_t lexeme = lex_oct (); | |
| insert_int_token (lexeme); | |
| } | |
| else if (PEEK_LOWER == 'b') | |
| { | |
| INPTR += 2; | |
| intmax_t lexeme = lex_bin (); | |
| insert_int_token (lexeme); | |
| } | |
| else | |
| { | |
| const char *lexeme = lex_number (); | |
| if (strchr (lexeme, '.') || strchr (lexeme, 'e') | |
| || strchr (lexeme, 'E')) | |
| { | |
| const double lexeme = lex_real (); | |
| insert_real_token (lexeme); | |
| } | |
| else | |
| { | |
| intmax_t lexeme = lex_integer (); | |
| insert_int_token (lexeme); | |
| } | |
| free (lexeme); | |
| } | |
| } | |
| else if (TOP_CHAR == '\'') | |
| { | |
| INPTR++; | |
| char32_t lexeme = lex_char (); | |
| INPTR++; | |
| insert_char_token (lexeme); | |
| } | |
| else if (TOP_CHAR == '"') | |
| { | |
| INPTR++; | |
| const char32_t *lexeme = lex_string (); | |
| INPTR++; | |
| uint32_t h = fnv1a_hash32 ((const char *)lexeme); | |
| if (SYMBOL_TALBE[h].key == NULL) | |
| { | |
| SYMBOL_TABLE[h].key = (const char *)lexeme; | |
| SYMBOL_TABLE[h].value.as_string = lexeme; | |
| SYMBOL_TABLE[h].reserved = false; | |
| insert_str_token (lexeme); | |
| } | |
| else | |
| { | |
| insert_str_token (SYMBOL_TABLE[h].value.as_string); | |
| free (lexeme); | |
| } | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment