Skip to content

Instantly share code, notes, and snippets.

@RealNeGate
Last active February 18, 2022 08:03
Show Gist options
  • Select an option

  • Save RealNeGate/18beaa4c5427aaf0c7b71a2e90fec0e2 to your computer and use it in GitHub Desktop.

Select an option

Save RealNeGate/18beaa4c5427aaf0c7b71a2e90fec0e2 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <x86intrin.h>
#include <Windows.h>
enum {
TOKEN_ACCESSOR = '.',
TOKEN_ADDITION = '+',
TOKEN_ASSIGN = '=',
TOKEN_PAREN_OPEN = '(',
TOKEN_PAREN_CLOSE = ')',
TOKEN_BRACE_OPEN = '{',
TOKEN_BRACE_CLOSE = '}',
TOKEN_IDENTIFIER = 256,
TOKEN_KW_IN,
TOKEN_KW_PRIVATE,
TOKEN_KW_PRIMITIVE,
TOKEN_KW_PROTECTED,
TOKEN_KW_INSTANCEOF,
TOKEN_INVALID,
TOKEN_STRING,
TOKEN_ELLIPSIS, /* ... */
TOKEN_INCREMENT, /* ++ */
TOKEN_EQUALITY, /* == */
TOKEN_STRICT_EQUALITY, /* === */
TOKEN_FAT_ARROW, /* => */
};
typedef struct Lexer {
const char* current;
// current token info
int token_type;
const char* token_start;
const char* token_end;
} Lexer;
static _Alignas(64) uint8_t identifier_char_tbl[] = {
0x00,0x00,0x00,0x00,
0x10,0x00,0xff,0x03,
0xfe,0xff,0xff,0x87,
0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00
};
__attribute__((always_inline))
static bool is_identifier_not_first_char(char ch) {
size_t i = ch;
size_t index = i / 8;
size_t shift = i & 7;
return (identifier_char_tbl[index] >> shift) & 1;
}
static _Alignas(64) uint8_t space_char_tbl[] = {
0x00,0x26,0x00,0x00,
0x01,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00
};
__attribute__((always_inline))
static bool is_space(char ch) {
size_t i = ch;
size_t index = i / 8;
size_t shift = i & 7;
return (space_char_tbl[index] >> shift) & 1;
}
void next(Lexer* restrict l) {
const char* current = l->current;
// branchless space skip
current += (*current == ' ');
redo_lex:;
const char* start = current;
switch (*start) {
case '\0':
l->token_type = '\0';
break;
case '\r':
case '\n':
case ' ':
case '\t':
case '\v':
// slow path
do {
current++;
} while (is_space(*current));
goto redo_lex;
case '\"':
current++;
do {
__m128i chars = _mm_loadu_si128((__m128i *)current);
int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));
if (len) {
current += len;
if (current[-1] == '\"' && current[-2] != '\\') break;
} else {
current += 16;
}
} while (*current);
l->token_type = TOKEN_STRING;
break;
case '=':
current++;
if (*current == '>') {
current++;
l->token_type = TOKEN_FAT_ARROW;
break;
}
current += (*current == '=');
current += (*current == '=');
const static int tbl[] = { TOKEN_ASSIGN, TOKEN_EQUALITY, TOKEN_STRICT_EQUALITY };
l->token_type = tbl[current-start];
break;
case '+':
current++;
current += (*current == '+');
l->token_type = ((current-start) == 2) ? TOKEN_INCREMENT : TOKEN_ADDITION;
break;
case '.':
current++;
// let's it know that it can remove
// the short circuit if it wanted.
char next0 = current[1];
char next1 = current[2];
if (next0 == '.' && next1 == '.') {
current += 2;
l->token_type = TOKEN_ELLIPSIS;
break;
}
l->token_type = TOKEN_ACCESSOR;
break;
case '(':
case ')':
case '{':
case '}':
current++;
l->token_type = *start;
break;
case 'A' ... 'Z':
case 'a' ... 'z':
case '_': case '$': {
__builtin_prefetch(identifier_char_tbl);
do {
current++;
} while (is_identifier_not_first_char(*((unsigned char*)current)));
int t = TOKEN_IDENTIFIER;
size_t len = current - start;
if (*start == 'i') {
if (len == 2 && start[1] == 'n') t = TOKEN_KW_IN;
else if (len == 10 && memcmp(start, "instanceof", 10) == 0) t = TOKEN_KW_INSTANCEOF;
} else {
if (len == 7 && memcmp(start, "private", 7) == 0) t = TOKEN_KW_PRIVATE;
else if (len == 9 && memcmp(start, "primitive", 9) == 0) t = TOKEN_KW_PRIMITIVE;
else if (len == 9 && memcmp(start, "protected", 9) == 0) t = TOKEN_KW_PROTECTED;
}
l->token_type = t;
break;
}
default:
l->token_type = TOKEN_INVALID;
break;
}
l->token_start = start;
l->token_end = current;
l->current = current;
}
static uint64_t get_timer_counter() {
LARGE_INTEGER t;
QueryPerformanceCounter(&t);
return t.QuadPart;
}
static double get_timer_frequency() {
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
return 1.0 / (double)freq.QuadPart;
}
#define NUM_ITERS 100000
int main(int argc, char **argv) {
if (argc == 1) {
printf("No input files!\n");
return 1;
}
FILE *f = fopen(argv[1],"rb");
char *text = (char *) malloc(1 << 20);
int len = f ? (int) fread(text, 1, 1<<20, f) : -1;
text[len] = '\0';
if (len < 0) {
printf("Error opening file\n");
free(text);
fclose(f);
return 1;
}
fclose(f);
uint64_t total_ticks = 0;
for (int i = 0; i < NUM_ITERS; i++) {
uint64_t start = get_timer_counter();
// Run lexer
Lexer l = { .current = text };
do {
next(&l);
//printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
} while (l.token_type);
uint64_t end = get_timer_counter();
total_ticks += (end - start);
}
double average = ((double)total_ticks / NUM_ITERS) * get_timer_frequency();
double average_ms = average / 1000000.0;
double bandwidth = (double)len / average;
printf("Input length: %d\n", len);
printf("Average time: %f us\n", average_ms);
printf("Bandwidth: %f GB/s\n", bandwidth / 1000000000.0);
free(text);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment