Last active
June 14, 2023 16:56
-
-
Save pervognsen/47bc3a24bf11f9be9ca84a18af4d5934 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <tuple> | |
#include <vector> | |
#include <string> | |
typedef uint32_t Str; | |
std::vector<const char*> strs; | |
enum { STR_BIAS = 1 << 21 }; | |
// The different bitwise tagging schemes combine to give an inline literal encoding of all three-character tokens | |
// with tight or loose whitespace trivia, which covers all operator tokens, several keywords and short identifiers. | |
Str strn(const char *s, size_t n) { | |
if (n <= 3) { | |
uint32_t i = 0; | |
if (n >= 1) i |= s[0]; | |
if (n >= 2) i |= s[1] << 7; | |
if (n >= 3) i |= s[2] << 14; | |
return i; | |
} | |
uint32_t i; | |
// In a real implementation this would be a hash table. One of my experimental ideas is to use content-defined | |
// chunking to define cut-points based on rolling hashes and then split and cons recursively to promote sharing. | |
for (i = 0; i < strs.size(); i++) | |
if (strncmp(strs[i], s, n) == 0) return i + STR_BIAS; | |
strs.push_back(_strdup(s)); | |
return i + STR_BIAS; | |
} | |
Str str(const char *s) { | |
return strn(s, strlen(s)); | |
} | |
// Since the AST is immutable, we can do typeless bitwise sharing of data. You could also use unaligned, | |
// overlapping substring matching like an LZ77 dictionary compressor. In practice you'd want to have more | |
// than just a 2:1 compressor but it's a universal interning primitive and it promotes data sharing, so if | |
// nothing else it's a good demonstration of hash consing as recursive dictionary compression. | |
std::vector<uint64_t> heap(1); | |
// cons is a 2:1 data compressor when (car, cdr) is already in the heap. You could dedup swapped pairs by | |
// storing them ordered in the heap and stealing a tag bit to store the direction so you can swap back in car/cdr. | |
uint32_t cons(uint32_t car, uint32_t cdr) { | |
uint64_t p = (uint64_t)car | ((uint64_t)cdr << 32); | |
uint32_t i; | |
// A real implementation would speed up the dedup match search with a hash table. But the hash table isn't needed | |
// to navigate the heap, and you can always rebuild it from the heap in linear time if you've thrown it away. | |
for (i = 1; i < heap.size(); i++) | |
if (heap[i] == p) return i; | |
heap.push_back(p); | |
return i; | |
} | |
uint64_t uncons(uint32_t i) { | |
return heap[i]; | |
} | |
uint32_t car(uint32_t i) { | |
return (uint32_t)heap[i]; | |
} | |
uint32_t cdr(uint32_t i) { | |
return (uint32_t)(heap[i] >> 32); | |
} | |
uint32_t bitcons(uint32_t n, uint32_t car, uint32_t cdr) { | |
// If you want to enforce powers of two: assert((n & (n - 1)) == 0); | |
assert(car < n); | |
assert(cdr < (1ull << 32) / n - (n - 1)); | |
return car + cdr * n; | |
} | |
uint32_t bitcar(uint32_t n, uint32_t p) { | |
return p % n; | |
} | |
uint32_t bitcdr(uint32_t n, uint32_t p) { | |
return p / n; | |
} | |
typedef uint32_t Trivia; | |
Trivia trivia(Str lhs, Str rhs) { | |
if (rhs == 0) return bitcons(2, 0, lhs); // Empty right trivia | |
return bitcons(2, 1, cons(lhs, rhs)); | |
} | |
typedef uint32_t Token; | |
Token token(Str str, Trivia triv) { | |
if (triv == 0) return bitcons(3, 0, str); // Tight (nothing on either side) | |
if (triv == 64) return bitcons(3, 1, str); // Loose (one space on left, nothing on right) | |
return bitcons(3, 2, cons(str, triv)); | |
} | |
typedef uint32_t Expr; | |
Expr name(Token tok) { | |
return bitcons(3, 0, tok); | |
} | |
Expr unary(Token op, Expr lhs) { | |
return bitcons(3, 1, cons(op, lhs)); | |
} | |
Expr binary(Token op, Expr lhs, Expr rhs) { | |
return bitcons(3, 2, cons(op, cons(lhs, rhs))); | |
} | |
int main(int argc, char** argv) { | |
auto empty = str(""); | |
auto space = str(" "); | |
auto tight = trivia(empty, empty); | |
auto loose = trivia(space, empty); | |
auto x = name(token(str("x"), tight)); | |
auto y = name(token(str("y"), loose)); | |
auto plus = token(str("+"), loose); | |
auto add_xy = binary(plus, x, y); | |
auto add_xy2 = binary(token(str("+"), loose), name(token(str("x"), tight)), name(token(str("y"), loose))); | |
assert(add_xy == add_xy2); | |
auto xor_xy = binary(token(str("^"), loose), x, y); | |
assert(xor_xy != add_xy); | |
assert(bitcar(3, add_xy) == bitcar(3, xor_xy)); | |
assert(cdr(bitcdr(3, add_xy)) == cdr(bitcdr(3, xor_xy))); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment