Very slow (300 MB/s) syntax highlighter for C++ - WIP from 2019; pulled verbatim from Eden text editor (dead project)
#include "parsing.h" | |
#include "buffer.h" | |
#include <ch_stl/time.h> | |
namespace parsing { | |
// This is a column-reduction table mapping all 256 possible byte values down
// to a 10-input space. (Bytes >= 128 are classed as identifier characters, so
// UTF-8 sequences flow through identifiers untouched.)
// The values in this table are premultiplied with the number of DFA states
// to save one multiply when indexing the state transition table.
#define P (DFA_NUM_STATES)
static const u8 char_type[] = {
    WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P,
    WHITE*P, WHITE*P, NEWLINE*P, WHITE*P, WHITE*P, NEWLINE*P, WHITE*P, WHITE*P,
    WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P,
    WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P, WHITE*P,
    WHITE*P, OP*P, DOUBLEQUOTE*P, OP*P, IDENT*P, OP*P, OP*P, SINGLEQUOTE*P,
    OP*P, OP*P, STAR*P, OP*P, OP*P, OP*P, OP*P, SLASH*P,
    DIGIT*P, DIGIT*P, DIGIT*P, DIGIT*P, DIGIT*P, DIGIT*P, DIGIT*P, DIGIT*P,
    DIGIT*P, DIGIT*P, OP*P, OP*P, OP*P, OP*P, OP*P, OP*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, OP*P, BS*P, OP*P, OP*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, OP*P, OP*P, OP*P, OP*P, WHITE*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
    IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P, IDENT*P,
};
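// Sanity check (added sketch; assumes the Char_Type enum in parsing.h is
// unchanged): the reduction table must classify every possible input byte.
static_assert(sizeof(char_type) == 256, "char_type must cover all 256 byte values");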
// This is a state transition table for the deterministic finite
// automaton (DFA) lexer. Overtop this DFA runs a block-comment scanner.
// This table is written in column-major format, because it allows for a
// premultiplied column index as mentioned above.
static const u8 lex_table[] = {
    // WHITE
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_WHITE, // DFA_WHITE
    DFA_WHITE, // DFA_WHITE_BS
    DFA_NEWLINE, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_WHITE, // DFA_SLASH
    DFA_WHITE, // DFA_IDENT
    DFA_WHITE, // DFA_OP
    DFA_WHITE, // DFA_OP2
    DFA_WHITE, // DFA_NUMLIT
    // NEWLINE
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_NEWLINE, // DFA_LINE_COMMENT
    DFA_NEWLINE, // DFA_WHITE
    DFA_WHITE, // DFA_WHITE_BS
    DFA_NEWLINE, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_NEWLINE, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_NEWLINE, // DFA_SLASH
    DFA_NEWLINE, // DFA_IDENT
    DFA_NEWLINE, // DFA_OP
    DFA_NEWLINE, // DFA_OP2
    DFA_NEWLINE, // DFA_NUMLIT
    // IDENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_IDENT, // DFA_WHITE
    DFA_IDENT, // DFA_WHITE_BS
    DFA_IDENT, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_IDENT, // DFA_SLASH
    DFA_IDENT, // DFA_IDENT
    DFA_IDENT, // DFA_OP
    DFA_IDENT, // DFA_OP2
    DFA_NUMLIT, // DFA_NUMLIT
    // DOUBLEQUOTE
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_STRINGLIT, // DFA_WHITE
    DFA_STRINGLIT, // DFA_WHITE_BS
    DFA_STRINGLIT, // DFA_NEWLINE
    DFA_WHITE, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_STRINGLIT, // DFA_SLASH
    DFA_STRINGLIT, // DFA_IDENT
    DFA_STRINGLIT, // DFA_OP
    DFA_STRINGLIT, // DFA_OP2
    DFA_STRINGLIT, // DFA_NUMLIT
    // SINGLEQUOTE
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_CHARLIT, // DFA_WHITE
    DFA_CHARLIT, // DFA_WHITE_BS
    DFA_CHARLIT, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_WHITE, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_CHARLIT, // DFA_SLASH
    DFA_CHARLIT, // DFA_IDENT
    DFA_CHARLIT, // DFA_OP
    DFA_CHARLIT, // DFA_OP2
    DFA_NUMLIT, // DFA_NUMLIT
    // DIGIT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_NUMLIT, // DFA_WHITE
    DFA_NUMLIT, // DFA_WHITE_BS
    DFA_NUMLIT, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_NUMLIT, // DFA_SLASH
    DFA_IDENT, // DFA_IDENT
    DFA_NUMLIT, // DFA_OP
    DFA_NUMLIT, // DFA_OP2
    DFA_NUMLIT, // DFA_NUMLIT
    // SLASH
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_WHITE, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_SLASH, // DFA_WHITE
    DFA_SLASH, // DFA_WHITE_BS
    DFA_SLASH, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_LINE_COMMENT, // DFA_SLASH
    DFA_SLASH, // DFA_IDENT
    DFA_SLASH, // DFA_OP
    DFA_SLASH, // DFA_OP2
    DFA_SLASH, // DFA_NUMLIT
    // STAR
    DFA_BLOCK_COMMENT_STAR, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT_STAR, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_OP, // DFA_WHITE
    DFA_OP, // DFA_WHITE_BS
    DFA_OP, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_BLOCK_COMMENT, // DFA_SLASH
    DFA_OP, // DFA_IDENT
    DFA_OP2, // DFA_OP
    DFA_OP, // DFA_OP2
    DFA_OP, // DFA_NUMLIT
    // BS
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_WHITE_BS, // DFA_WHITE
    DFA_WHITE_BS, // DFA_WHITE_BS
    DFA_WHITE_BS, // DFA_NEWLINE
    DFA_STRINGLIT_BS, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT_BS, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_WHITE_BS, // DFA_SLASH
    DFA_WHITE_BS, // DFA_IDENT
    DFA_WHITE_BS, // DFA_OP
    DFA_WHITE_BS, // DFA_OP2
    DFA_WHITE_BS, // DFA_NUMLIT
    // OP
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT
    DFA_BLOCK_COMMENT, // DFA_BLOCK_COMMENT_STAR
    DFA_LINE_COMMENT, // DFA_LINE_COMMENT
    DFA_OP, // DFA_WHITE
    DFA_OP, // DFA_WHITE_BS
    DFA_OP, // DFA_NEWLINE
    DFA_STRINGLIT, // DFA_STRINGLIT
    DFA_STRINGLIT, // DFA_STRINGLIT_BS
    DFA_CHARLIT, // DFA_CHARLIT
    DFA_CHARLIT, // DFA_CHARLIT_BS
    DFA_OP, // DFA_SLASH
    DFA_OP, // DFA_IDENT
    DFA_OP2, // DFA_OP
    DFA_OP, // DFA_OP2
    DFA_OP, // DFA_NUMLIT
};
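// Sanity check plus a worked transition (added sketch, assuming the enums in
// parsing.h): the table holds one column of DFA_NUM_STATES entries per
// character class. For example, seeing '*' (STAR) while in DFA_SLASH moves to
// DFA_BLOCK_COMMENT, i.e. lex_table[DFA_SLASH + STAR*P] == DFA_BLOCK_COMMENT,
// whereas seeing '/' in DFA_SLASH moves to DFA_LINE_COMMENT instead.
static_assert(sizeof(lex_table) == NUM_CHAR_TYPES * DFA_NUM_STATES,
              "one DFA column per character class");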
u8 lex(u8 dfa, const u8* p, const u8* const end, Lexeme*& lexemes) {
    while (p < end) {
        u8 new_dfa = lex_table[dfa + char_type[*p]];
        if (new_dfa != dfa) {
            lexemes->i = p;
            lexemes->dfa = (Lex_Dfa)new_dfa;
            lexemes->cached_first = *p;
            lexemes++;
            dfa = new_dfa;
        }
        p++;
    }
    return dfa;
}
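// Minimal usage sketch for lex() (a hypothetical driver, not part of the
// editor): run the DFA over one flat buffer and count the lexemes emitted at
// each state change. parse_cpp() below is the real caller; it seeds the state
// with DFA_NEWLINE the same way so a leading '#' reads as a directive.
#if 0
static u64 lex_flat_buffer(const u8* data, u64 n, Lexeme* out) {
    Lexeme* cursor = out;
    lex(DFA_NEWLINE, data, data + n, cursor); // cursor advances once per state change
    return cursor - out;                      // number of lexemes written
}
#endif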
const u8* temp_parser_gap;
u64 temp_parser_gap_size;
static const u8 lexeme_sentinel_buffer[16];
u64 toklen(const Lexeme* l) {
    const u8* next_i = l[1].i;
    if (next_i == lexeme_sentinel_buffer) next_i = l[2].i;
    if (next_i == temp_parser_gap + temp_parser_gap_size) next_i = temp_parser_gap;
    return next_i - l->i;
}
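// Example: for two adjacent lexemes starting at bytes 10 and 14, toklen() is 4.
// The two fixups above cover the parser-time special cases: when the next
// lexeme is the scratch sentinel, the real end pointer lives one lexeme
// further along, and when the next lexeme starts just past the gap, the gap
// is folded back out so lengths stay byte-accurate.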
//bool nested = false; // JUST for debugging
static void parse(Lexeme* l, Lexeme* end); // defined static below, so the declaration must be static too
#define KW_CHUNK_(s, offset, I) \
    ((I) + offset < (sizeof(s) - 1) \
         ? (u64)(s)[(I) + offset] << (u64)((I)*8ull) \
         : 0ull)
#define KW_CHUNK__(s, offset) \
    (KW_CHUNK_(s, offset, 0) | KW_CHUNK_(s, offset, 1) | \
     KW_CHUNK_(s, offset, 2) | KW_CHUNK_(s, offset, 3) | \
     KW_CHUNK_(s, offset, 4) | KW_CHUNK_(s, offset, 5) | \
     KW_CHUNK_(s, offset, 6) | KW_CHUNK_(s, offset, 7))
#define KW_CHUNK0(s) KW_CHUNK__(s, 0)
#define KW_CHUNK1(s) KW_CHUNK__(s, 8)
#define KW_CHUNK(x) KW_CHUNK0(x)
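// Example of the packing (little-endian, low byte first): KW_CHUNK("if")
// evaluates to (u64)'i' | (u64)'f' << 8 == 0x6669, and KW_CHUNK("return")
// packs all six letters into the low 48 bits. A keyword longer than 8 bytes
// would need KW_CHUNK1 for its second chunk.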
// Loading byte-wise so that we don't read past the end of the buffer.
// This is a whole bunch of typing that I shouldn't need to write, but I have
// no guarantee the compiler will coalesce the reads properly otherwise.
// In fact, even with all THIS there's no guarantee; you'd need something even
// gnarlier to force the right loads. But I trust that the release build will
// merge these. (Maybe I shouldn't.)
#define Load1(x) (u64)x[0]
#define Load2(x) Load1(x) | (u64)x[1] << 8
#define Load3(x) Load2(x) | (u64)x[2] << 16
#define Load4(x) Load3(x) | (u64)x[3] << 24
#define Load5(x) Load4(x) | (u64)x[4] << 32ull
#define Load6(x) Load5(x) | (u64)x[5] << 40ull
#define Load7(x) Load6(x) | (u64)x[6] << 48ull
#define Load8(x) Load7(x) | (u64)x[7] << 56ull
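// These mirror KW_CHUNK: with p pointing at the bytes "if", Load2(p) yields
// the same 0x6669 that KW_CHUNK("if") produces, so a packed token can be
// switched directly against packed keyword constants. (The macro bodies are
// unparenthesized, so they are only safe in contexts like switch(...) that
// supply their own parentheses, which is how they are used below.)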
// This is a nasty macro to be able to switch on tokens in a fast and
// intelligent way. Obviously C++ sucks: there's no way to switch on strings
// smartly, so I have to construct this monstrosity of a macro to get the
// desired behaviour. Still, it's got a speed advantage, and speed is paramount.
#define switch_on_token(l_, fordef, for2, for3, for4, for5, for6, for7, for8) \
    do { \
        const Lexeme* switch_on_token_lexeme = (l_); \
        const u8* swchp = switch_on_token_lexeme->i; \
        switch (u64 len = toklen(switch_on_token_lexeme)) { \
        default: def: { fordef; } break; \
        case 2: switch (Load2(swchp)) { default: goto def; { for2; } } break; \
        case 3: switch (Load3(swchp)) { default: goto def; { for3; } } break; \
        case 4: switch (Load4(swchp)) { default: goto def; { for4; } } break; \
        case 5: switch (Load5(swchp)) { default: goto def; { for5; } } break; \
        case 6: switch (Load6(swchp)) { default: goto def; { for6; } } break; \
        case 7: switch (Load7(swchp)) { default: goto def; { for7; } } break; \
        case 8: switch (Load8(swchp)) { default: goto def; { for8; } } break; \
        } \
    } while (0);
#include "parsing_cpp_keywords.h" | |
bool is_keyword(const Lexeme* l) { | |
#define IS_KEYWORD_CASE(name) case KW_CHUNK(#name): return true; | |
switch_on_token(l, return false, | |
CPP_KEYWORDS_2(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_3(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_4(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_5(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_6(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_7(IS_KEYWORD_CASE), | |
CPP_KEYWORDS_8(IS_KEYWORD_CASE)); | |
} | |
static Lexeme* skip_comments_in_line(Lexeme* l, Lexeme* end) {
    while (l < end && (l->dfa < DFA_NEWLINE || (l->dfa == DFA_SLASH && l[1].dfa <= DFA_LINE_COMMENT))) l++;
    return l;
}
static Lexeme* parse_preproc(Lexeme* l, Lexeme* end) {
    l->dfa = DFA_PREPROC;
    l++;
    l = skip_comments_in_line(l, end);
    if (l->dfa == DFA_IDENT) {
        Lexeme* directive = l;
        l->dfa = DFA_PREPROC;
        l++;
        l = skip_comments_in_line(l, end);
        switch_on_token(directive,,,,,,
            case KW_CHUNK("define"):
                if (l->dfa == DFA_IDENT) {
                    l->dfa = DFA_MACRO;
                    l++;
                    if (l->c() == '(') {
                        while (l < end && l->dfa != DFA_NEWLINE) {
                            l++;
                            if (l->c() == ')') {
                                l++;
                                break;
                            }
                        }
                    }
                }
                break,
            case KW_CHUNK("include"):
                if (l->c() == '<') {
                    l->dfa = DFA_STRINGLIT;
                    while (l < end && l->dfa != DFA_NEWLINE) {
                        l->dfa = DFA_STRINGLIT;
                        l++;
                        if (l->c() == '>') {
                            l->dfa = DFA_STRINGLIT;
                            l++;
                            break;
                        }
                    }
                }
                break,);
    }
    Lexeme* preproc_begin = l;
    while (l < end && l->dfa != DFA_NEWLINE) l++;
    //nested = true;
    parse(preproc_begin, l);
    //nested = false;
    return l;
}
static Lexeme* next_token(Lexeme* l, Lexeme* end) {
    while (true) {
        l = skip_comments_in_line(l, end);
        if (l < end && l->dfa == DFA_NEWLINE) {
            l++;
            if (l->c() == '#') {
                //assert(!nested);
                l = parse_preproc(l, end);
            }
            continue;
        }
        break;
    }
    return l;
}
static bool at_token(Lexeme* l, Lexeme* end) {
    Lexeme* r = skip_comments_in_line(l, end);
    return r == l && (!(r < end) || r->c() != '#');
}
// Brace nesting precedence: {for(;[;{;];)}}
// 1. {} - Curlies: most important
// 2. () - STATEMENT parentheses: for(;;) etc.
//    ;  - Semicolon supersedes everything below
// 3. () - EXPRESSION/PARAMETER parentheses: call(x, y); int func(int a); etc.
// 4. [] - Square brackets
//    ,  - Comma supersedes almost nothing
// 5. <> - Greater-than/less-than: least important
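// Worked example for the precedence above: in "for (i = 0; a[i] < n; i++)"
// the parens are statement parens (level 2), so the ';' separators stay
// contained inside them; in "call(x, y)" the parens are expression parens
// (level 3) and the ',' only separates arguments; "a[i]" nests at level 4
// throughout.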
static Lexeme* parse_stmt(Lexeme* l, Lexeme* end, Lex_Dfa var_name = DFA_IDENT);
static Lexeme* parse_expr(Lexeme* l, Lexeme* end);
static Lexeme* parse_stmt_braces(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '{');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_stmt(l, end);
        if (l->c() == ',' ||
            l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')') {
            l++;
        }
        if (l->c() == '}') {
            break;
        }
        l = next_token(l, end);
    }
    return l;
}
static Lexeme* parse_expr_braces(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '{');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_expr(l, end);
        if (l->c() == ',' ||
            l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')') {
            l++;
        }
        if (l->c() == '}') {
            break;
        }
        l = next_token(l, end);
    }
    return l;
}
static Lexeme* parse_stmt_parens(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '(');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_stmt(l, end);
        if (l->c() == ',' ||
            l->c() == ']' ||
            l->c() == ';') {
            l++;
            l = next_token(l, end);
        }
        if (l->c() == ')' ||
            l->c() == '}') {
            break;
        }
    }
    return l;
}
static Lexeme* parse_expr(Lexeme* l, Lexeme* end);
static Lexeme* parse_exprs_til_semi(Lexeme* l, Lexeme* end) {
    //assert(l < end);
    while (l < end) {
        l = parse_expr(l, end);
        if (l->c() == ',' ||
            l->c() == ']') {
            l++;
        }
        if (l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            break;
        }
        l = next_token(l, end);
    }
    return l;
}
static Lexeme* parse_exprs_til_comma(Lexeme* l, Lexeme* end) {
    //assert(l < end);
    while (l < end) {
        l = parse_expr(l, end);
        if (l->c() == ',' ||
            l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            break;
        }
        l = next_token(l, end);
    }
    return l;
}
static Lexeme* parse_expr_parens(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '(');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_exprs_til_comma(l, end);
        if (l->c() == ',') {
            l++;
            l = next_token(l, end);
        }
        if (l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            break;
        }
    }
    return l;
}
static Lexeme* parse_expr_sqr(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '[');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_exprs_til_comma(l, end);
        if (l->c() == ',') {
            l++;
            l = next_token(l, end);
        }
        if (l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            break;
        }
    }
    return l;
}
static Lexeme* parse_param(Lexeme* l, Lexeme* end);
static Lexeme* parse_params(Lexeme* l, Lexeme* end) {
    assert(l < end);
    assert(l->c() == '(');
    l++;
    l = next_token(l, end);
    while (l < end) {
        l = parse_param(l, end);
        if (l->c() == ',') {
            l++;
            l = next_token(l, end);
        }
        if (l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            break;
        }
    }
    return l;
}
static Lexeme* parse_struct_union(Lexeme* l, Lexeme* end);
static Lexeme* parse_expr(Lexeme* l, Lexeme* end) {
    //assert(l < end);
    if (!(l < end)) return l;
    if (l->c() == '#') {
        if (l[1].c() == '#') {
            l->dfa = DFA_IDENT;
            l++;
            l->dfa = DFA_IDENT;
            l++;
            l = next_token(l, end);
            if (l->dfa == DFA_NUMLIT) {
                l->dfa = DFA_IDENT;
                l++;
            }
        } else {
            l->dfa = DFA_PREPROC;
            l++;
        }
        l = next_token(l, end);
    }
    switch_on_token(l,,,,,
        case KW_CHUNK("union"): {
            l = parse_struct_union(l, end);
        } break;,
        case KW_CHUNK("sizeof"): {
            l++;
            l = next_token(l, end);
            l = parse_expr(l, end);
        } break;
        case KW_CHUNK("struct"): {
            l = parse_struct_union(l, end);
        } break;,,);
    if (l->c() == '~' ||
        l->c() == '!' ||
        l->c() == '&' ||
        l->c() == '*' ||
        l->c() == '-' ||
        l->c() == '+') {
        l++;
        l = next_token(l, end);
        return parse_expr(l, end);
    }
    if (l->c() == '{') {
        l = parse_expr_braces(l, end);
        if (l->c() == '}') {
            l++;
            l = next_token(l, end);
        }
    } else if (l->c() == '(') {
        l = parse_expr_parens(l, end);
        if (l->c() == ')') {
            l++;
            l = next_token(l, end);
        }
    } else if (l->c() == '[') {
        l = parse_expr_sqr(l, end);
        if (l->c() == ']') {
            l++;
            l = next_token(l, end);
        }
        if (l->c() == '(') { // lambda parameter list
            l = parse_params(l, end);
            if (l->c() == ')') {
                l++;
                l = next_token(l, end);
            }
        }
        if (l->c() == '{') { // lambda body
            l = parse_stmt_braces(l, end);
            if (l->c() == '}') {
                l++;
                l = next_token(l, end);
            }
        }
    } else if (l->dfa == DFA_IDENT) {
        Lexeme* ident = l;
        l++;
        l = next_token(l, end);
        if (l->c() == '(') {
            ident->dfa = DFA_FUNCTION;
            l = parse_expr_parens(l, end);
            if (l->c() == ')') {
                l++;
                l = next_token(l, end);
            }
        } else if (l->c() == '{') {
            ident->dfa = DFA_TYPE;
            l = parse_expr_braces(l, end);
            if (l->c() == '}') {
                l++;
                l = next_token(l, end);
            }
        }
    } else if (l->dfa == DFA_NUMLIT) {
        l++;
        l = next_token(l, end);
    } else if (l->dfa == DFA_STRINGLIT) {
        while (l < end && (l->dfa == DFA_STRINGLIT || l->dfa == DFA_STRINGLIT_BS)) {
            l++;
            l = next_token(l, end);
            // lets us properly parse this:
            //   char*x = "string literal but there's "
            //   #pragma once
            //   "more more more";
        }
    } else if (l->dfa == DFA_CHARLIT) {
        while (l < end && (l->dfa == DFA_CHARLIT || l->dfa == DFA_CHARLIT_BS)) {
            l++;
            l = next_token(l, end);
            // same deal as the string-literal case above, for char literals
            // split around a preprocessor line
        }
    }
    if (l->c() == '[' ||
        l->c() == '(') {
        l = parse_expr(l, end);
        l = next_token(l, end);
    }
    if (l->c() == '%' ||
        l->c() == '^' ||
        l->c() == '&' ||
        l->c() == '*' ||
        l->c() == '-' ||
        l->c() == '=' ||
        l->c() == '+' ||
        l->c() == '|' ||
        l->c() == ':' ||
        l->c() == '<' ||
        l->c() == '.' ||
        l->c() == '>' ||
        l->c() == '/' ||
        l->c() == '?') {
        l++;
        l = next_token(l, end);
        l = parse_expr(l, end);
    }
    return l;
}
static Lexeme* parse_type(Lexeme* l, Lexeme* end) {
    if (l->dfa == DFA_IDENT) {
        l->dfa = DFA_TYPE;
        l++;
        l = next_token(l, end);
        if (l->c() == '<') {
            do {
                l++;
                l = next_token(l, end);
                l = parse_type(l, end);
            } while (l->c() == ',');
            if (l->c() == '>') {
                l++;
                l = next_token(l, end);
            }
        }
        while (l < end) {
            if (l->c() == '*' ||
                l->c() == '&') {
                l->dfa = DFA_TYPE;
                l++;
                l = next_token(l, end);
            } else if (l->c() == '[') {
                l = parse_expr_sqr(l, end);
                if (l->c() == ']') {
                    l++;
                    l = next_token(l, end);
                }
            } else if (l->c() == '(') {
                do {
                    l++;
                    l = next_token(l, end);
                    l = parse_type(l, end);
                } while (l->c() == ',');
                if (l->c() == ')') {
                    l++;
                    l = next_token(l, end);
                }
            } else {
                break;
            }
            if (l->c() == ';' ||
                l->c() == '}' ||
                l->c() == ')' ||
                l->c() == ']' ||
                l->c() == '>') {
                break;
            }
        }
    }
    return l;
}
static Lexeme* parse_param(Lexeme* l, Lexeme* end) { return parse_stmt(l, end, DFA_PARAM); }
static Lexeme* parse_if_switch_while_for(Lexeme* l, Lexeme* end) {
    l++;
    l = next_token(l, end);
    if (l->c() == '(') {
        l = parse_stmt_parens(l, end);
        if (l->c() == ')') {
            l++;
            l = next_token(l, end);
        }
    }
    return parse_stmt(l, end);
}
static Lexeme* parse_struct_union(Lexeme* l, Lexeme* end) {
    l++;
    l = next_token(l, end);
    if (l->dfa == DFA_IDENT) {
        l->dfa = DFA_TYPE;
        l++;
        l = next_token(l, end);
    }
    if (l->c() == '{') {
        l = parse_stmt_braces(l, end);
        if (l->c() == '}') {
            l++;
            l = next_token(l, end);
        }
    }
    return l;
}
static Lexeme* parse_using(Lexeme* l, Lexeme* end) {
    l++;
    l = next_token(l, end);
    if (l->dfa == DFA_IDENT) {
        l->dfa = DFA_TYPE;
        l++;
        l = next_token(l, end);
    }
    if (l->c() == '=') {
        l++;
        l = next_token(l, end);
        l = parse_type(l, end);
    }
    return l;
}
static Lexeme* parse_stmt(Lexeme* l, Lexeme* end, Lex_Dfa var_name_type) {
    //if (l->c() == '{') {
    //    return parse_stmt_braces(l, end);
    //}
    if (l->dfa != DFA_IDENT) {
        return parse_exprs_til_semi(l, end);
    }
    Lexeme* first = l;
    switch_on_token(l,
        {
            l->dfa = DFA_TYPE;
            l++;
            l = next_token(l, end);
            if (var_name_type == DFA_IDENT) {
                // Ad-hoc heuristic: Quickly scan ahead and check if this is definitely an expression.
                switch (l->c()) {
                case '~': case '!': case '#': case '%': case '^':
                case '-': case '+': case '=': case '[': case '{':
                case ']': case '}': case '|': case ';': case '"':
                case ',': case '.': case '>': case '/':
                    first->dfa = DFA_IDENT;
                    return parse_exprs_til_semi(l, end);
                case ':':
                    first->dfa = DFA_LABEL;
                    l++;
                    l = next_token(l, end);
                    return parse_stmt(l, end);
                }
            }
        } break;,
        case KW_CHUNK("if"): {
            return parse_if_switch_while_for(l, end);
        } break;
        case KW_CHUNK("do"): {
            l++;
            l = next_token(l, end);
            return parse_stmt(l, end);
        } break;,
        case KW_CHUNK("for"): {
            return parse_if_switch_while_for(l, end);
        } break;,
        case KW_CHUNK("else"): {
            l++;
            l = next_token(l, end);
            return parse_stmt(l, end);
        } break;,
        case KW_CHUNK("union"): {
            l = parse_struct_union(l, end);
        } break;
        case KW_CHUNK("while"): {
            return parse_if_switch_while_for(l, end);
        } break;
        case KW_CHUNK("using"): {
            return parse_using(l, end);
        } break;,
        case KW_CHUNK("return"): {
            l++;
            l = next_token(l, end);
            return parse_exprs_til_semi(l, end);
        } break;
        case KW_CHUNK("struct"): {
            l = parse_struct_union(l, end);
        } break;
        case KW_CHUNK("switch"): {
            return parse_if_switch_while_for(l, end);
        } break;,
        case KW_CHUNK("typedef"): {
            l++;
            l = next_token(l, end);
            return parse_stmt(l, end, DFA_TYPE);
        } break;,
    );
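    // Past the keyword switch, `first` has been tentatively colored DFA_TYPE
    // and the loop below scans what should be a declarator list, e.g.
    //     int *a, (*fn)(void), b[4];
    // Identifiers are provisionally colored DFA_TYPE as they stream past;
    // `last` remembers the most recent one so it can be recolored as the
    // variable (or parameter) name once the declarator ends, and a '(' seen
    // after a name or a closing ')' flags a likely function definition so
    // that its braces parse as statements.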
    bool is_likely_function = false;
more_decls:
    bool seen_closing_paren = false;
    int paren_nesting = 0;
    Lexeme* last = nullptr;
    Lexeme* func = nullptr;
    while (l < end &&
           (l->dfa == DFA_IDENT ||
            l->c() == '*' ||
            l->c() == '&' ||
            l->c() == '(' ||
            l->c() == ')' ||
            paren_nesting)) {
        if (l->c() == '(') {
            if (seen_closing_paren || func) {
                if (func) {
                    func->dfa = DFA_FUNCTION;
                }
                is_likely_function = true;
                l = parse_params(l, end);
                if (l->c() == ')') {
                    l++;
                    l = next_token(l, end);
                }
                continue;
            } else {
                paren_nesting++;
            }
            func = nullptr;
        } else if (l->c() == ')') {
            paren_nesting--;
            if (paren_nesting < 0) break;
            seen_closing_paren = true;
            func = nullptr;
        } else if (l->c() == '[') {
            l = parse_expr_sqr(l, end);
            if (l->c() == ']') {
                l++;
                l = next_token(l, end);
            }
            continue;
        } else if (l->c() == '*' ||
                   l->c() == '&') {
            l->dfa = DFA_TYPE;
        } else if (l->dfa == DFA_IDENT) {
            last = l;
            func = l;
            l->dfa = DFA_TYPE;
        } else if (l->c() == ':' && l[1].c() == ':') {
            // do nothing
        } else {
            while (l < end && paren_nesting && l->c() != ';') {
                l = parse_exprs_til_semi(l, end);
                if (l->c() == ')') {
                    paren_nesting--;
                    l++;
                    l = next_token(l, end);
                }
            }
            return l;
        }
        l++;
        l = next_token(l, end);
    }
    if (last && last->dfa != DFA_FUNCTION) last->dfa = var_name_type;
    if (l < end) {
        if (var_name_type != DFA_PARAM) {
            //assert(at_token(l, end));
            l = next_token(l, end);
            if (is_likely_function && l->c() == '{') {
                l = parse_stmt_braces(l, end);
                if (l->c() == '}') {
                    l++;
                    l = next_token(l, end);
                }
                return l;
            } else {
                l = parse_exprs_til_comma(l, end);
                if (l->c() == ',') {
                    l++;
                    l = next_token(l, end);
                    goto more_decls;
                }
            }
        } else {
            l = parse_exprs_til_comma(l, end);
        }
    }
    return l;
}
static void parse(Lexeme* l, Lexeme* end) {
    while (l < end) {
        //if (l->dfa == DFA_IDENT) {
        //    Lexeme* m = l;
        //    do {
        //        l++;
        //        l = skip_comments_in_line(l, end);
        //    } while (l->dfa == DFA_NEWLINE);
        //    if (l->c() == '(') m->dfa = DFA_FUNCTION;
        //} else l++;
        l = next_token(l, end);
        l = parse_stmt(l, end);
        assert(l->c() != ',');
        if (l->c() == ']' ||
            l->c() == ';' ||
            l->c() == ')' ||
            l->c() == '}') {
            l++;
        }
    }
}
void parse_cpp(Buffer* buf) {
    if (!buf->syntax_dirty || buf->disable_parse) return;
    buf->syntax_dirty = false;
    ch::Gap_Buffer<u8>& b = buf->gap_buffer;
    usize buffer_count = b.count();
    // Three extra lexemes:
    // One extra lexeme at the front.
    // One at the back to indicate buffer end to the parser, pointing to safe
    // scratch data.
    // One after that pointing to the real position of the (unreadable) buffer
    // end, so that identifier lengths can be correctly computed.
    buf->lexemes.reserve((1 + buffer_count + 1 + 1) - buf->lexemes.allocated);
    buf->lexemes.count = 1;
    if (buffer_count) {
        u8 lexer = DFA_NEWLINE;
        Lexeme* lex_seeker = buf->lexemes.begin();
        {
            lex_seeker->i = b.data;
            lex_seeker->dfa = (Lex_Dfa)lexer;
            lex_seeker->cached_first = b.data[0];
            lex_seeker++;
        }
        f64 lex_time = -ch::get_time_in_seconds();
        lexer = lex(lexer, b.data, b.gap, lex_seeker);
        Lexeme* lexeme_at_gap = lex_seeker - 1;
        lexer = lex(lexer, b.gap + b.gap_size, b.data + b.allocated, lex_seeker);
        lex_time += ch::get_time_in_seconds();
        if (lexeme_at_gap > buf->lexemes.begin() && lexeme_at_gap < lex_seeker) {
            assert(lexeme_at_gap->i < b.gap);
            if (lexeme_at_gap + 1 < lex_seeker) {
                assert(lexeme_at_gap[1].i >= b.gap + b.gap_size);
            }
            b.move_gap_to_index(lexeme_at_gap->i - b.data);
            assert(lexeme_at_gap->i == b.gap);
            lexeme_at_gap->i += b.gap_size;
            // lexeme_at_gap->cached_first should definitely not have changed.
        }
        {
            assert(lex_seeker < buf->lexemes.begin() + buf->lexemes.allocated);
            lex_seeker->dfa = DFA_NUM_STATES;
            lex_seeker->i = lexeme_sentinel_buffer; // So the parser can safely read from here.
            lex_seeker->cached_first = lex_seeker->i[0];
            lex_seeker++;
            assert(lex_seeker < buf->lexemes.begin() + buf->lexemes.allocated);
            lex_seeker->dfa = DFA_NUM_STATES;
            lex_seeker->i = b.data + b.allocated; // So the parser knows the real end position.
            lex_seeker->cached_first = 0;
            lex_seeker++;
        }
        buf->lexemes.count = lex_seeker - buf->lexemes.begin();
        temp_parser_gap = b.gap;
        temp_parser_gap_size = b.gap_size;
        f64 parse_time = -ch::get_time_in_seconds();
        parse(buf->lexemes.begin(), buf->lexemes.end() - 2);
        parse_time += ch::get_time_in_seconds();
        buf->lexemes.end()[-2] = buf->lexemes.end()[-1];
        buf->lexemes.count -= 1;
        buf->lex_time += lex_time;
        buf->parse_time += parse_time;
        buf->lex_parse_count++;
    }
}
} // namespace parsing
// ===================== parsing.h =====================
#pragma once
#include <ch_stl/types.h>
struct Buffer;
// C++ lexing/parsing tools.
namespace parsing {
// This is the actual DFA state machine. It's not perfect, but I (phillip) can
// get it to run at 13 Mloc/s on a 3.6 GHz machine with 1600 MHz DDR3 RAM and,
// as of writing, YEET's UTF-8 gap buffer.
// It does as little as it can; it tries not to *parse* C++, only lex it.
// This means it just scans comments and string/number literals (mostly).
// It doesn't even recognize keywords; it treats all identifiers alike.
// The tradeoff is that this puts an increased burden of code processing
// onto the parser. There is no nesting of any kind.
// I don't yet know the optimal ordering here for cache-efficiency, if there is
// one; I will have to write some sort of randomizer/permutation-checker.
enum Lex_Dfa : u8 {
    DFA_BLOCK_COMMENT,
    DFA_BLOCK_COMMENT_STAR,
    DFA_LINE_COMMENT,
    DFA_WHITE,
    DFA_WHITE_BS,
    DFA_NEWLINE,
    DFA_STRINGLIT,
    DFA_STRINGLIT_BS,
    DFA_CHARLIT,
    DFA_CHARLIT_BS,
    DFA_SLASH,
    DFA_IDENT,
    DFA_OP,
    DFA_OP2,
    DFA_NUMLIT,
    DFA_NUM_STATES,
    DFA_PREPROC,
    DFA_MACRO,
    DFA_TYPE,
    DFA_KEYWORD,
    DFA_FUNCTION,
    DFA_PARAM,
    DFA_LABEL,
};
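// Values below DFA_NUM_STATES are genuine lexer states; DFA_NUM_STATES itself
// doubles as the dfa value of the sentinel lexemes appended in parse_cpp(),
// and the entries after it are never produced by the DFA. They are written
// into Lexeme::dfa after the fact (mostly by the parser) as highlight classes.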
// This is an enum for categorizing C++ source code characters.
// Since the lexer uses a table-based DFA, all of its relevant
// char types need to be numerically adjacent, so that they can
// index a contiguous cache-friendly table.
// Strictly speaking, a real lexer would also process digraphs,
// but digraphs are rarely used. Trigraphs are never used.
enum Char_Type : u8 {
    WHITE,
    NEWLINE,     // '\r' '\n'
    IDENT,       // '$' '@'-'Z' '_'-'z'
    DOUBLEQUOTE, // '"'
    SINGLEQUOTE, // '\''
    DIGIT,       // '0'-'9'
    SLASH,       // '/'
    STAR,        // '*'
    BS,          // '\\'
    OP,
    NUM_CHAR_TYPES,
};
struct Lexeme {
    const u8* i;
    u8 dfa;
    u8 cached_first;
    CH_FORCEINLINE u8 c() const { return cached_first; }
};
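// Usage sketch: a lexeme is just a start pointer plus its DFA class. A token's
// bytes are [l->i, next->i) up to the following lexeme (toklen() in the .cpp
// computes this with the gap and sentinel fixups), and c() returns the cached
// first byte without dereferencing i again, which stays valid even after the
// gap buffer has moved.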
bool is_keyword(const Lexeme* l);
void parse_cpp(Buffer* b);
} // namespace parsing