RealNeGate · February 18, 2022 08:03
diff --git a/thingy.c b/thingy.c
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <string.h>
 #include <x86intrin.h>
 #include <Windows.h>

 enum {
    TOKEN_ACCESSOR = '.',

    TOKEN_ADDITION = '+',
    TOKEN_ASSIGN = '=',

    TOKEN_PAREN_OPEN = '(',
    TOKEN_PAREN_CLOSE = ')',

    TOKEN_BRACE_OPEN = '{',
    TOKEN_BRACE_CLOSE = '}',

    TOKEN_IDENTIFIER = 256,
    TOKEN_KW_IN,
    TOKEN_KW_PRIVATE,
    TOKEN_KW_PRIMITIVE,
    TOKEN_KW_PROTECTED,
    TOKEN_KW_INSTANCEOF,
 	
    TOKEN_INVALID,
    TOKEN_STRING,

    TOKEN_ELLIPSIS,   /* ... */
    TOKEN_INCREMENT,  /* ++ */
    TOKEN_EQUALITY,   /* == */
    TOKEN_STRICT_EQUALITY, /* === */
    TOKEN_FAT_ARROW,  /* => */
 };

 typedef struct Lexer {
    const char* current;

    // current token info
    int token_type;
    const char* token_start;
    const char* token_end;
 } Lexer;

 static _Alignas(64) uint8_t identifier_char_tbl[] = {
 	0x00,0x00,0x00,0x00,
 	0x10,0x00,0xff,0x03,
 	0xfe,0xff,0xff,0x87,
 	0xfe,0xff,0xff,0x07,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00
 };
 	
 __attribute__((always_inline))
 static bool is_identifier_not_first_char(char ch) {
 	size_t i = ch;
 	size_t index = i / 8;
 	size_t shift = i & 7;
 	return (identifier_char_tbl[index] >> shift) & 1;
 }

 static _Alignas(64) uint8_t space_char_tbl[] = {
 	0x00,0x26,0x00,0x00,
 	0x01,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00,
 	0x00,0x00,0x00,0x00
 };

 __attribute__((always_inline))
 static bool is_space(char ch) {
 	size_t i = ch;
 	size_t index = i / 8;
 	size_t shift = i & 7;
 	return (space_char_tbl[index] >> shift) & 1;
 }

 void next(Lexer* restrict l) {
    const char* current = l->current;

    // branchless space skip
    current += (*current == ' ');

    redo_lex:;
    const char* start = current;
    switch (*start) {
        case '\0':
            l->token_type = '\0';
            break;
        case '\r':
        case '\n':
        case ' ':
        case '\t':
        case '\v':
            // slow path
            do {
                current++;
            } while (is_space(*current));
            goto redo_lex;
        case '\"':
            current++;
 		
            do {
                __m128i chars = _mm_loadu_si128((__m128i *)current);
                int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));
 				
                if (len) {
                    current += len;
                    if (current[-1] == '\"' && current[-2] != '\\') break;
                } else {
                    current += 16;
                }
            } while (*current);

            l->token_type = TOKEN_STRING;
            break;
        case '=':
            current++;
            if (*current == '>') {
                current++;
                l->token_type = TOKEN_FAT_ARROW;
                break;
            }

            current += (*current == '=');
            current += (*current == '=');

            const static int tbl[] = { TOKEN_ASSIGN, TOKEN_EQUALITY, TOKEN_STRICT_EQUALITY };
            l->token_type = tbl[current-start];
            break;
        case '+':
            current++;
            current += (*current == '+');

            l->token_type = ((current-start) == 2) ? TOKEN_INCREMENT : TOKEN_ADDITION;
            break;
        case '.':
            current++;

            // let's it know that it can remove
            // the short circuit if it wanted.
            char next0 = current[1];
            char next1 = current[2];
            if (next0 == '.' && next1 == '.') {
                current += 2;
                l->token_type = TOKEN_ELLIPSIS;
                break;
            }

            l->token_type = TOKEN_ACCESSOR;
            break;
        case '(':
        case ')':
        case '{':
        case '}':
            current++;

            l->token_type = *start;
            break;
        case 'A' ... 'Z':
        case 'a' ... 'z':
        case '_': case '$': {
                __builtin_prefetch(identifier_char_tbl);
                do {
                    current++;
                } while (is_identifier_not_first_char(*((unsigned char*)current)));
 			
                int t = TOKEN_IDENTIFIER;
                size_t len = current - start;
                if (*start == 'i') {
                    if (len == 2 && start[1] == 'n') t = TOKEN_KW_IN;
                    else if (len == 10 && memcmp(start, "instanceof", 10) == 0) t = TOKEN_KW_INSTANCEOF;
                } else {
                    if (len == 7 && memcmp(start, "private", 7) == 0) t = TOKEN_KW_PRIVATE;
                    else if (len == 9 && memcmp(start, "primitive", 9) == 0) t = TOKEN_KW_PRIMITIVE;
                    else if (len == 9 && memcmp(start, "protected", 9) == 0) t = TOKEN_KW_PROTECTED;
                }

                l->token_type = t;
                break;
        }
        default: 
                l->token_type = TOKEN_INVALID;
                break;
    }

    l->token_start = start;
    l->token_end = current;
    l->current = current;
 }

 static uint64_t get_timer_counter() {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return t.QuadPart;
 }

 static double get_timer_frequency() {
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    return 1.0 / (double)freq.QuadPart;
 }

 #define NUM_ITERS 100000

 int main(int argc, char **argv) {
    if (argc == 1) {
        printf("No input files!\n");
        return 1;
 	}
 	
    FILE *f = fopen(argv[1],"rb");
    char *text = (char *) malloc(1 << 20);
    int len = f ? (int) fread(text, 1, 1<<20, f) : -1;
    text[len] = '\0';
 	
    if (len < 0) {
        printf("Error opening file\n");
        free(text);
        fclose(f);
        return 1;
    }
    fclose(f);
 	
    uint64_t total_ticks = 0;
    for (int i = 0; i < NUM_ITERS; i++) {
        uint64_t start = get_timer_counter();
 		
        // Run lexer
        Lexer l = { .current = text };
        do {
            next(&l);
            //printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
        } while (l.token_type);
 		
        uint64_t end = get_timer_counter();
        total_ticks += (end - start);
    }
 	
    double average = ((double)total_ticks / NUM_ITERS) * get_timer_frequency();
    double average_ms = average / 1000000.0;
    double bandwidth = (double)len / average;
 	
    printf("Input length: %d\n", len);
    printf("Average time: %f us\n", average_ms);
    printf("Bandwidth: %f GB/s\n", bandwidth / 1000000000.0);
 	
    free(text);
    return 0;
 }
	#include <stdlib.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>
	#include <string.h>
	#include <x86intrin.h>
	#include <Windows.h>

	enum {
	TOKEN_ACCESSOR = '.',

	TOKEN_ADDITION = '+',
	TOKEN_ASSIGN = '=',

	TOKEN_PAREN_OPEN = '(',
	TOKEN_PAREN_CLOSE = ')',

	TOKEN_BRACE_OPEN = '{',
	TOKEN_BRACE_CLOSE = '}',

	TOKEN_IDENTIFIER = 256,
	TOKEN_KW_IN,
	TOKEN_KW_PRIVATE,
	TOKEN_KW_PRIMITIVE,
	TOKEN_KW_PROTECTED,
	TOKEN_KW_INSTANCEOF,

	TOKEN_INVALID,
	TOKEN_STRING,

	TOKEN_ELLIPSIS, /* ... */
	TOKEN_INCREMENT, /* ++ */
	TOKEN_EQUALITY, /* == */
	TOKEN_STRICT_EQUALITY, /* === */
	TOKEN_FAT_ARROW, /* => */
	};

	typedef struct Lexer {
	const char* current;

	// current token info
	int token_type;
	const char* token_start;
	const char* token_end;
	} Lexer;

	static _Alignas(64) uint8_t identifier_char_tbl[] = {
	0x00,0x00,0x00,0x00,
	0x10,0x00,0xff,0x03,
	0xfe,0xff,0xff,0x87,
	0xfe,0xff,0xff,0x07,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
	};

	__attribute__((always_inline))
	static bool is_identifier_not_first_char(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (identifier_char_tbl[index] >> shift) & 1;
	}

	static _Alignas(64) uint8_t space_char_tbl[] = {
	0x00,0x26,0x00,0x00,
	0x01,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
	};

	__attribute__((always_inline))
	static bool is_space(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (space_char_tbl[index] >> shift) & 1;
	}

	void next(Lexer* restrict l) {
	const char* current = l->current;

	// branchless space skip
	current += (*current == ' ');

	redo_lex:;
	const char* start = current;
	switch (*start) {
	case '\0':
	l->token_type = '\0';
	break;
	case '\r':
	case '\n':
	case ' ':
	case '\t':
	case '\v':
	// slow path
	do {
	current++;
	} while (is_space(*current));
	goto redo_lex;
	case '\"':
	current++;

	do {
	__m128i chars = _mm_loadu_si128((__m128i *)current);
	int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));

	if (len) {
	current += len;
	if (current[-1] == '\"' && current[-2] != '\\') break;
	} else {
	current += 16;
	}
	} while (*current);

	l->token_type = TOKEN_STRING;
	break;
	case '=':
	current++;
	if (*current == '>') {
	current++;
	l->token_type = TOKEN_FAT_ARROW;
	break;
	}

	current += (*current == '=');
	current += (*current == '=');

	const static int tbl[] = { TOKEN_ASSIGN, TOKEN_EQUALITY, TOKEN_STRICT_EQUALITY };
	l->token_type = tbl[current-start];
	break;
	case '+':
	current++;
	current += (*current == '+');

	l->token_type = ((current-start) == 2) ? TOKEN_INCREMENT : TOKEN_ADDITION;
	break;
	case '.':
	current++;

	// let's it know that it can remove
	// the short circuit if it wanted.
	char next0 = current[1];
	char next1 = current[2];
	if (next0 == '.' && next1 == '.') {
	current += 2;
	l->token_type = TOKEN_ELLIPSIS;
	break;
	}

	l->token_type = TOKEN_ACCESSOR;
	break;
	case '(':
	case ')':
	case '{':
	case '}':
	current++;

	l->token_type = *start;
	break;
	case 'A' ... 'Z':
	case 'a' ... 'z':
	case '_': case '$': {
	__builtin_prefetch(identifier_char_tbl);
	do {
	current++;
	} while (is_identifier_not_first_char(((unsigned char)current)));

	int t = TOKEN_IDENTIFIER;
	size_t len = current - start;
	if (*start == 'i') {
	if (len == 2 && start[1] == 'n') t = TOKEN_KW_IN;
	else if (len == 10 && memcmp(start, "instanceof", 10) == 0) t = TOKEN_KW_INSTANCEOF;
	} else {
	if (len == 7 && memcmp(start, "private", 7) == 0) t = TOKEN_KW_PRIVATE;
	else if (len == 9 && memcmp(start, "primitive", 9) == 0) t = TOKEN_KW_PRIMITIVE;
	else if (len == 9 && memcmp(start, "protected", 9) == 0) t = TOKEN_KW_PROTECTED;
	}

	l->token_type = t;
	break;
	}
	default:
	l->token_type = TOKEN_INVALID;
	break;
	}

	l->token_start = start;
	l->token_end = current;
	l->current = current;
	}

	static uint64_t get_timer_counter() {
	LARGE_INTEGER t;
	QueryPerformanceCounter(&t);
	return t.QuadPart;
	}

	static double get_timer_frequency() {
	LARGE_INTEGER freq;
	QueryPerformanceFrequency(&freq);
	return 1.0 / (double)freq.QuadPart;
	}

	#define NUM_ITERS 100000

	int main(int argc, char **argv) {
	if (argc == 1) {
	printf("No input files!\n");
	return 1;
	}

	FILE *f = fopen(argv[1],"rb");
	char text = (char ) malloc(1 << 20);
	int len = f ? (int) fread(text, 1, 1<<20, f) : -1;
	text[len] = '\0';

	if (len < 0) {
	printf("Error opening file\n");
	free(text);
	fclose(f);
	return 1;
	}
	fclose(f);

	uint64_t total_ticks = 0;
	for (int i = 0; i < NUM_ITERS; i++) {
	uint64_t start = get_timer_counter();

	// Run lexer
	Lexer l = { .current = text };
	do {
	next(&l);
	//printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
	} while (l.token_type);

	uint64_t end = get_timer_counter();
	total_ticks += (end - start);
	}

	double average = ((double)total_ticks / NUM_ITERS) * get_timer_frequency();
	double average_ms = average / 1000000.0;
	double bandwidth = (double)len / average;

	printf("Input length: %d\n", len);
	printf("Average time: %f us\n", average_ms);
	printf("Bandwidth: %f GB/s\n", bandwidth / 1000000000.0);

	free(text);
	return 0;
	}
No results found