Created
November 18, 2023 15:53
-
-
Save 8dcc/9e3d9da86b113cd54894b0a56b6548e7 to your computer and use it in GitHub Desktop.
Slightly modified syntax highlighter from StackOverflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <ctype.h> | |
#include <errno.h> | |
/* | |
* C syntax highlighter to stdout. | |
* Source: https://stackoverflow.com/a/77465783/11715554 | |
*/ | |
// XXX: no support for UTF-8 identifiers | |
// XXX: no support for universal-character-name in identifiers | |
/*
 * Token classes produced by c_get_token().  The numeric values double as
 * indices into the c_colors[] table, so the order must stay in sync with it.
 */
enum c_token_type {
    END = 0,        // end of input (NUL byte reached)
    WHITESPACE,     // run of blanks that does not include a newline
    NEWLINE,        // a single end-of-line
    COMMENT,        // line comment or terminated block comment
    PREPROCESSOR,   // '#' seen at the beginning of a line
    KEYWORD,        // word listed in c_keywords
    IDENTIFIER,     // any other word
    STRING,         // terminated string literal
    CHARCONST,      // terminated character constant
    NUMBER,         // pp-number
    OPERATOR,       // punctuator listed in c_punctuators
    CTYPE,          // type name (see c_is_type)
    FUNCALL,        // identifier followed by '('
    OTHER,          // character that matches no punctuator
    ERROR,          // unterminated string, character constant or comment
};
/*
 * Scanner state shared by all c_* parsing helpers.
 *
 * Field order matters: c_colorize() initialises this struct positionally.
 */
struct c_parse_context {
    const char *filename;     // name of the input, kept for reference only
    const char *source;       // start of the NUL-terminated source text
    const char *p;            // current read position inside source
    const char *token_start;  // position where the current token began
    char token_string[80];    // spelling of the last identifier/operator
    int line_number;          // initialised to 1 by c_colorize()
    int column_number;        // initialised to 1 by c_colorize()
    int at_bol;               // non-zero while only blanks were seen on the line
    int in_preprocess;        // non-zero from a leading '#' until end of line
};
/*
 * Space-delimited list of C keywords, searched with c_find_word().
 * Every word, including the first and the last, has a space on both sides.
 */
static const char c_keywords[] =
    // C89/C99 statements, storage classes and qualifiers
    " auto break case const continue default do else enum extern for "
    " goto if inline register restrict return sizeof static struct "
    " switch typedef union volatile while "
    // builtin type keywords
    " char double float int long unsigned short signed void "
    // underscore-prefixed C99/C11 keywords
    " _Alignas _Alignof _Atomic _Generic _Noreturn _Static_assert "
    " _Thread_local "
    // underscore-prefixed C99/C11 type keywords
    " _Bool _Complex _Imaginary "
    // keywords introduced by C23
    " alignas alignof constexpr false nullptr static_assert thread_local "
    " true typeof typeof_unqual "
    // type keywords introduced by C23
    " bool _BitInt _Decimal128 _Decimal32 _Decimal64 ";
/*
 * Space-delimited list of type names, searched with c_find_word().
 * Every word, including the first and the last, has a space on both sides.
 * Names ending in "_t" are recognised separately by c_is_type().
 */
static const char c_types[] =
    // builtin type keywords
    " char double float int long unsigned short signed void "
    // C99 and C11 types
    " _Bool _Complex _Imaginary "
    // C23 types
    " bool _BitInt _Decimal128 _Decimal32 _Decimal64 "
    // common standard library types (va_list is listed here only)
    " FILE va_list ";
/*
 * Space-delimited list of punctuators, searched with c_find_word().
 * The longest entry is four characters, which is the limit that
 * c_parse_operator() scans up to.
 */
static const char c_punctuators[] =
    // brackets, member access, unary/binary arithmetic and comparisons
    " [ ] ( ) { } . -> ++ -- & * + - ~ ! / % << >> < > <= >= == != ^ | "
    // logical, conditional, assignment and preprocessor operators
    " && || ? : :: ; ... = *= /= %= += -= <<= >>= &= ^= |= , # ## "
    // digraphs
    " <: :> <% %> %: %:%: ";
/*
 * Look up the first `len` characters of `s` in a space-delimited word list.
 *
 * words: NUL-terminated list in which every word has a space on both sides.
 * s:     candidate word; only s[0..len-1] is examined, it need not be
 *        NUL-terminated.
 * len:   number of characters in the candidate.
 *
 * Returns 1 if the candidate appears as a whole word, 0 otherwise.
 * An empty candidate, or one containing a space or a NUL, never matches.
 */
static int c_find_word(const char *words, const char *s, int len) {
    if (len <= 0)
        return 0;

    size_t n = (size_t)len;
    // A word cannot contain the separator or the terminator; rejecting
    // these up front also keeps the scan below inside the list.
    if (memchr(s, ' ', n) != NULL || memchr(s, '\0', n) != NULL)
        return 0;

    for (const char *p = words; (p = strchr(p, *s)) != NULL; p++) {
        // p > words guards the p[-1] read when the hit is the first byte.
        if (p > words && p[-1] == ' ' && strncmp(p, s, n) == 0 && p[n] == ' ')
            return 1;
    }
    return 0;
}
static int c_is_keyword(const char* s, int len) { | |
return c_find_word(c_keywords, s, len); | |
} | |
static int c_is_type(const char* s, int len) { | |
return c_find_word(c_types, s, len) || | |
(len > 2 && s[len - 2] == '_' && s[len - 1] == 't'); | |
} | |
static int c_getc(struct c_parse_context* pc) { | |
for (;;) { | |
int c = (unsigned char)*pc->p++; | |
if (c == '\0') { | |
pc->p--; | |
return 0; | |
} | |
if (c == '\r') { // convert end of line sequences to '\n' | |
if (*pc->p == '\n') | |
pc->p += 1; | |
return '\n'; | |
} | |
#if 0 // trigraphs can be handled here. | |
if (c == '?' && *pc->p == '?') { | |
switch (pc->p[1]) { | |
case '=': c = '#'; pc->p += 2; break; | |
case '(': c = '['; pc->p += 2; break; | |
case '/': c = '\\'; pc->p += 2; break; | |
case ')': c = ']'; pc->p += 2; break; | |
case '\’': c = '^'; pc->p += 2; break; | |
case '<': c = '{'; pc->p += 2; break; | |
case '!': c = '|'; pc->p += 2; break; | |
case '>': c = '}'; pc->p += 2; break; | |
case '-': c = '~'; pc->p += 2; break; | |
} | |
} | |
#endif | |
if (c == '\\') { // remove escaped newlines | |
if (*pc->p == '\n') { | |
pc->p++; | |
continue; | |
} | |
if (*pc->p == '\r') { | |
pc->p++; | |
if (*pc->p == '\n') | |
pc->p++; | |
continue; | |
} | |
} | |
return c; | |
} | |
} | |
static int c_peekc(struct c_parse_context* pc) { | |
const char* start = pc->p; | |
int c = c_getc(pc); | |
pc->p = start; | |
return c; | |
} | |
static int c_peekc2(struct c_parse_context* pc) { | |
const char* start = pc->p; | |
int c = c_getc(pc); | |
c = c_getc(pc); | |
pc->p = start; | |
return c; | |
} | |
/*
 * Return 1 if `c` may appear in an identifier (letter, digit or
 * underscore), 0 otherwise.  `c` must be a value accepted by <ctype.h>.
 */
static inline int c_isalnum_(int c) {
    if (c == '_')
        return 1;
    return isalnum(c) != 0;
}
static int c_parse_number(struct c_parse_context* pc, int lastc) { | |
/* parse a pp-number, the grammar allows for invalid numbers */ | |
int c; | |
for (; (c = c_peekc(pc)) != '\0'; lastc = c, c_getc(pc)) { | |
if (!c_isalnum_(c) && c != '.') { | |
if (c == '+' || c == '-') { | |
if (!memchr("eEpP", lastc, 4)) | |
break; | |
} else if (c == '\'') { // C23 digit separators | |
if (!c_isalnum_(c_peekc2(pc))) | |
break; | |
} else { | |
break; | |
} | |
} | |
} | |
return NUMBER; | |
} | |
static int c_parse_string(struct c_parse_context* pc, int sep) { | |
int c; | |
while ((c = c_peekc(pc)) != '\0' && c != '\n') { | |
c_getc(pc); | |
if (c == sep) | |
return (sep == '\'') ? CHARCONST : STRING; | |
if (c == '\\' && c_getc(pc) == '\0') | |
break; | |
} | |
// unterminated string or character constant | |
return ERROR; | |
} | |
static int c_parse_operator(struct c_parse_context* pc, int c) { | |
const char* save[4]; | |
size_t len = 0; | |
for (size_t i = 0;;) { | |
pc->token_string[i] = (char)c; | |
save[i] = pc->p; | |
i++; | |
if (c_find_word(c_punctuators, pc->token_string, i)) | |
len = i; | |
if (i == 4 || !ispunct(c = c_getc(pc))) | |
break; | |
} | |
if (len) { | |
pc->p = save[len - 1]; | |
pc->token_string[len] = '\0'; | |
return OPERATOR; | |
} else { | |
pc->p = save[0]; | |
return OTHER; | |
} | |
} | |
static int c_parse_comment1(struct c_parse_context* pc) { | |
int c; | |
while ((c = c_peekc(pc)) != '\0' && c != '\n') | |
c_getc(pc); | |
return COMMENT; | |
} | |
static int c_parse_comment2(struct c_parse_context* pc) { | |
int c; | |
while ((c = c_peekc(pc)) != '\0') { | |
c_getc(pc); | |
if (c == '*' && c_peekc(pc) == '/') { | |
c_getc(pc); | |
return COMMENT; | |
} | |
} | |
// unterminated comment | |
return ERROR; | |
} | |
static int c_parse_identifier(struct c_parse_context* pc, int c) { | |
size_t len = 0; | |
pc->token_string[len++] = (char)c; | |
while (c_isalnum_(c = c_peekc(pc))) { | |
if (len < sizeof(pc->token_string) - 1) | |
pc->token_string[len++] = (char)c; | |
c_getc(pc); | |
} | |
pc->token_string[len] = '\0'; | |
if (c_is_type(pc->token_string, len)) | |
return CTYPE; | |
if (c_is_keyword(pc->token_string, len)) | |
return KEYWORD; | |
if (isblank(c)) | |
c = c_peekc2(pc); | |
if (c == '(') | |
return FUNCALL; | |
return IDENTIFIER; | |
} | |
enum c_token_type c_get_token(struct c_parse_context* pc) { | |
int c, c1, c2; | |
pc->token_start = pc->p; | |
c = c_getc(pc); | |
if (isspace(c)) { | |
if (c == '\n') { | |
pc->at_bol = 1; | |
pc->in_preprocess = 0; | |
return NEWLINE; | |
} | |
while (memchr(" \t\f\v", c_peekc(pc), 4)) | |
c_getc(pc); | |
return WHITESPACE; | |
} | |
if (pc->at_bol) { | |
pc->at_bol = 0; | |
if (c == '#') { | |
pc->at_bol = 0; | |
pc->in_preprocess = 1; | |
return PREPROCESSOR; | |
} | |
} | |
switch (c) { | |
case '\0': | |
return END; | |
case '/': | |
if (c_peekc(pc) == '/') { | |
c_getc(pc); | |
return c_parse_comment1(pc); | |
} | |
if (c_peekc(pc) == '*') { | |
c_getc(pc); | |
return c_parse_comment2(pc); | |
} | |
break; | |
case '.': | |
if (isdigit(c_peekc(pc))) | |
return c_parse_number(pc, c); | |
break; | |
case '0': | |
case '1': | |
case '2': | |
case '3': | |
case '4': | |
case '5': | |
case '6': | |
case '7': | |
case '8': | |
case '9': | |
return c_parse_number(pc, c); | |
case '\'': | |
case '\"': | |
return c_parse_string(pc, c); | |
case 'L': | |
case 'U': | |
if ((c1 = c_peekc(pc)) == '\'' || c1 == '\"') | |
return c_parse_string(pc, c_getc(pc)); | |
break; | |
case 'u': | |
if ((c1 = c_peekc(pc)) == '8' && | |
((c2 = c_peekc2(pc)) == '\'' || c2 == '\"')) { | |
c_getc(pc); | |
return c_parse_string(pc, c_getc(pc)); | |
} | |
if (c1 == '\'' || c1 == '\"') | |
return c_parse_string(pc, c_getc(pc)); | |
break; | |
} | |
// XXX: should handle UTF-8 and universal-character-name here | |
if (c_isalnum_(c)) | |
return c_parse_identifier(pc, c); | |
return c_parse_operator(pc, c); | |
} | |
// C token colorizer | |
// ANSI colors | |
#define RESET "\033[0m" | |
#define BLACK "\033[30m" | |
#define RED "\033[31m" | |
#define GREEN "\033[32m" | |
#define YELLOW "\033[33m" | |
#define BLUE "\033[34m" | |
#define MAGENTA "\033[35m" | |
#define CYAN "\033[36m" | |
#define WHITE "\033[37m" | |
#define GREY "\033[90m" | |
#define BRIGHT_RED "\033[91m" | |
#define BRIGHT_GREEN "\033[92m" | |
#define BRIGHT_YELLOW "\033[93m" | |
#define BRIGHT_BLUE "\033[94m" | |
#define BRIGHT_MAGENTA "\033[95m" | |
#define BRIGHT_CYAN "\033[96m" | |
#define BRIGHT_WHITE "\033[97m" | |
#define DEFAULT BRIGHT_GREEN | |
const char* const c_colors[] = { | |
[END] = RESET, | |
[WHITESPACE] = DEFAULT, | |
[NEWLINE] = RESET, | |
[COMMENT] = WHITE, | |
[PREPROCESSOR] = CYAN, | |
[KEYWORD] = BRIGHT_WHITE, | |
[IDENTIFIER] = DEFAULT, | |
[STRING] = BRIGHT_CYAN, | |
[CHARCONST] = BRIGHT_CYAN, | |
[NUMBER] = GREEN, | |
[OPERATOR] = DEFAULT, | |
[CTYPE] = BRIGHT_MAGENTA, | |
[FUNCALL] = BRIGHT_YELLOW, | |
[OTHER] = RED, | |
[ERROR] = RED, | |
}; | |
void c_colorize(const char* filename, const char* source) { | |
struct c_parse_context ctx = { | |
filename, source, source, NULL, { 0 }, 1, 1, 1, 0, | |
}; | |
enum c_token_type last_color = END; | |
for (;;) { | |
enum c_token_type tok_type = c_get_token(&ctx); | |
const char* s = ctx.token_start; | |
int len = ctx.p - s; | |
enum c_token_type color = tok_type; | |
if (ctx.in_preprocess && tok_type != COMMENT) | |
color = PREPROCESSOR; | |
if (last_color != color) { | |
if (c_colors[color]) | |
fputs(c_colors[color], stdout); | |
last_color = color; | |
} | |
if (tok_type == END) | |
break; | |
printf("%.*s", len, s); | |
s += len; | |
} | |
} | |
/*
 * Read a whole stream into a newly allocated NUL-terminated string.
 *
 * filename: path to open when `fp` is NULL; otherwise used only in
 *           diagnostics.
 * fp:       stream to read from, or NULL to open `filename`.  A stream
 *           passed in by the caller is left open; one opened here is
 *           closed before returning.
 *
 * Returns the contents (an empty string for empty input), which the caller
 * must free(), or NULL after printing a diagnostic to stderr when the file
 * cannot be opened, memory runs out, or a read error occurs.
 */
char *load_file(const char *filename, FILE *fp) {
    char buf[4096];
    char *source = NULL;
    size_t len = 0;
    size_t nread;
    FILE *fp_close = NULL;

    if (fp == NULL) {
        fp = fopen(filename, "r");
        if (fp == NULL) {
            fprintf(stderr, "cannot open %s: %s\n", filename, strerror(errno));
            return NULL;
        }
        fp_close = fp;
    }

    // Start with an empty string so that empty input is not mistaken for
    // a failure by the caller.
    source = malloc(1);
    if (source == NULL) {
        fprintf(stderr, "out of memory for %s\n", filename);
        goto done;
    }
    source[0] = '\0';

    while ((nread = fread(buf, 1, sizeof buf, fp)) > 0) {
        char *new_buf = realloc(source, len + nread + 1);
        if (new_buf == NULL) {
            fprintf(stderr, "out of memory for %s\n", filename);
            free(source);
            source = NULL;
            goto done;
        }
        source = new_buf;
        memcpy(source + len, buf, nread);
        len += nread;
        source[len] = '\0';
    }

    // fread() returns 0 both at end of file and on error; tell them apart.
    if (ferror(fp)) {
        fprintf(stderr, "error reading %s: %s\n", filename, strerror(errno));
        free(source);
        source = NULL;
    }

done:
    if (fp_close)
        fclose(fp_close);
    return source;
}
/*
 * Colorize every file named on the command line, in order, or standard
 * input when no file is given.  Files that cannot be loaded are skipped
 * (load_file() reports the problem).  Always returns 0.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        // no arguments: act as a filter
        char *text = load_file("<stdin>", stdin);
        if (text) {
            c_colorize("<stdin>", text);
            free(text);
        }
        return 0;
    }

    for (int i = 1; i < argc; i++) {
        const char *path = argv[i];
        char *text = load_file(path, NULL);
        if (text == NULL)
            continue;
        c_colorize(path, text);
        free(text);
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Revision number: 5