Last active
February 18, 2022 08:03
-
-
Save RealNeGate/18beaa4c5427aaf0c7b71a2e90fec0e2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <stdint.h> | |
| #include <stdbool.h> | |
| #include <string.h> | |
| #include <x86intrin.h> | |
| #include <Windows.h> | |
| enum { | |
| TOKEN_ACCESSOR = '.', | |
| TOKEN_ADDITION = '+', | |
| TOKEN_ASSIGN = '=', | |
| TOKEN_PAREN_OPEN = '(', | |
| TOKEN_PAREN_CLOSE = ')', | |
| TOKEN_BRACE_OPEN = '{', | |
| TOKEN_BRACE_CLOSE = '}', | |
| TOKEN_IDENTIFIER = 256, | |
| TOKEN_KW_IN, | |
| TOKEN_KW_PRIVATE, | |
| TOKEN_KW_PRIMITIVE, | |
| TOKEN_KW_PROTECTED, | |
| TOKEN_KW_INSTANCEOF, | |
| TOKEN_INVALID, | |
| TOKEN_STRING, | |
| TOKEN_ELLIPSIS, /* ... */ | |
| TOKEN_INCREMENT, /* ++ */ | |
| TOKEN_EQUALITY, /* == */ | |
| TOKEN_STRICT_EQUALITY, /* === */ | |
| TOKEN_FAT_ARROW, /* => */ | |
| }; | |
| typedef struct Lexer { | |
| const char* current; | |
| // current token info | |
| int token_type; | |
| const char* token_start; | |
| const char* token_end; | |
| } Lexer; | |
| static _Alignas(64) uint8_t identifier_char_tbl[] = { | |
| 0x00,0x00,0x00,0x00, | |
| 0x10,0x00,0xff,0x03, | |
| 0xfe,0xff,0xff,0x87, | |
| 0xfe,0xff,0xff,0x07, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00 | |
| }; | |
| __attribute__((always_inline)) | |
| static bool is_identifier_not_first_char(char ch) { | |
| size_t i = ch; | |
| size_t index = i / 8; | |
| size_t shift = i & 7; | |
| return (identifier_char_tbl[index] >> shift) & 1; | |
| } | |
| static _Alignas(64) uint8_t space_char_tbl[] = { | |
| 0x00,0x26,0x00,0x00, | |
| 0x01,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00, | |
| 0x00,0x00,0x00,0x00 | |
| }; | |
| __attribute__((always_inline)) | |
| static bool is_space(char ch) { | |
| size_t i = ch; | |
| size_t index = i / 8; | |
| size_t shift = i & 7; | |
| return (space_char_tbl[index] >> shift) & 1; | |
| } | |
| void next(Lexer* restrict l) { | |
| const char* current = l->current; | |
| // branchless space skip | |
| current += (*current == ' '); | |
| redo_lex:; | |
| const char* start = current; | |
| switch (*start) { | |
| case '\0': | |
| l->token_type = '\0'; | |
| break; | |
| case '\r': | |
| case '\n': | |
| case ' ': | |
| case '\t': | |
| case '\v': | |
| // slow path | |
| do { | |
| current++; | |
| } while (is_space(*current)); | |
| goto redo_lex; | |
| case '\"': | |
| current++; | |
| do { | |
| __m128i chars = _mm_loadu_si128((__m128i *)current); | |
| int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"')))); | |
| if (len) { | |
| current += len; | |
| if (current[-1] == '\"' && current[-2] != '\\') break; | |
| } else { | |
| current += 16; | |
| } | |
| } while (*current); | |
| l->token_type = TOKEN_STRING; | |
| break; | |
| case '=': | |
| current++; | |
| if (*current == '>') { | |
| current++; | |
| l->token_type = TOKEN_FAT_ARROW; | |
| break; | |
| } | |
| current += (*current == '='); | |
| current += (*current == '='); | |
| const static int tbl[] = { TOKEN_ASSIGN, TOKEN_EQUALITY, TOKEN_STRICT_EQUALITY }; | |
| l->token_type = tbl[current-start]; | |
| break; | |
| case '+': | |
| current++; | |
| current += (*current == '+'); | |
| l->token_type = ((current-start) == 2) ? TOKEN_INCREMENT : TOKEN_ADDITION; | |
| break; | |
| case '.': | |
| current++; | |
| // let's it know that it can remove | |
| // the short circuit if it wanted. | |
| char next0 = current[1]; | |
| char next1 = current[2]; | |
| if (next0 == '.' && next1 == '.') { | |
| current += 2; | |
| l->token_type = TOKEN_ELLIPSIS; | |
| break; | |
| } | |
| l->token_type = TOKEN_ACCESSOR; | |
| break; | |
| case '(': | |
| case ')': | |
| case '{': | |
| case '}': | |
| current++; | |
| l->token_type = *start; | |
| break; | |
| case 'A' ... 'Z': | |
| case 'a' ... 'z': | |
| case '_': case '$': { | |
| __builtin_prefetch(identifier_char_tbl); | |
| do { | |
| current++; | |
| } while (is_identifier_not_first_char(*((unsigned char*)current))); | |
| int t = TOKEN_IDENTIFIER; | |
| size_t len = current - start; | |
| if (*start == 'i') { | |
| if (len == 2 && start[1] == 'n') t = TOKEN_KW_IN; | |
| else if (len == 10 && memcmp(start, "instanceof", 10) == 0) t = TOKEN_KW_INSTANCEOF; | |
| } else { | |
| if (len == 7 && memcmp(start, "private", 7) == 0) t = TOKEN_KW_PRIVATE; | |
| else if (len == 9 && memcmp(start, "primitive", 9) == 0) t = TOKEN_KW_PRIMITIVE; | |
| else if (len == 9 && memcmp(start, "protected", 9) == 0) t = TOKEN_KW_PROTECTED; | |
| } | |
| l->token_type = t; | |
| break; | |
| } | |
| default: | |
| l->token_type = TOKEN_INVALID; | |
| break; | |
| } | |
| l->token_start = start; | |
| l->token_end = current; | |
| l->current = current; | |
| } | |
| static uint64_t get_timer_counter() { | |
| LARGE_INTEGER t; | |
| QueryPerformanceCounter(&t); | |
| return t.QuadPart; | |
| } | |
| static double get_timer_frequency() { | |
| LARGE_INTEGER freq; | |
| QueryPerformanceFrequency(&freq); | |
| return 1.0 / (double)freq.QuadPart; | |
| } | |
| #define NUM_ITERS 100000 | |
| int main(int argc, char **argv) { | |
| if (argc == 1) { | |
| printf("No input files!\n"); | |
| return 1; | |
| } | |
| FILE *f = fopen(argv[1],"rb"); | |
| char *text = (char *) malloc(1 << 20); | |
| int len = f ? (int) fread(text, 1, 1<<20, f) : -1; | |
| text[len] = '\0'; | |
| if (len < 0) { | |
| printf("Error opening file\n"); | |
| free(text); | |
| fclose(f); | |
| return 1; | |
| } | |
| fclose(f); | |
| uint64_t total_ticks = 0; | |
| for (int i = 0; i < NUM_ITERS; i++) { | |
| uint64_t start = get_timer_counter(); | |
| // Run lexer | |
| Lexer l = { .current = text }; | |
| do { | |
| next(&l); | |
| //printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start); | |
| } while (l.token_type); | |
| uint64_t end = get_timer_counter(); | |
| total_ticks += (end - start); | |
| } | |
| double average = ((double)total_ticks / NUM_ITERS) * get_timer_frequency(); | |
| double average_ms = average / 1000000.0; | |
| double bandwidth = (double)len / average; | |
| printf("Input length: %d\n", len); | |
| printf("Average time: %f us\n", average_ms); | |
| printf("Bandwidth: %f GB/s\n", bandwidth / 1000000000.0); | |
| free(text); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment