Created
April 16, 2020 01:32
-
-
Save Kazark/28d191b5b5c074617c7e1de882537aee to your computer and use it in GitHub Desktop.
Usable Lisp-style symbols, without a global symbol table or interning or heavyweight strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <limits.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
typedef struct MaybeULong { | |
const unsigned long nothing; | |
const unsigned long x; | |
} MaybeULong; | |
const MaybeULong justULong(const unsigned long x) { | |
MaybeULong j = { .nothing = 0, .x = x }; | |
return j; | |
} | |
const MaybeULong nothingULong = { .nothing = 1, .x = 0 }; | |
/* | |
I assert: | |
1. Having Lisp/Prolog-style symbols in your language is fantastic. | |
2. Having a global symbol table sucks. Giant global state. Blech. | |
3. Interning strings for symbols feels heavy. | |
So, can't we just represent symbols by words? (We'll assume that modern | |
hardware is moving enough toward 64-bit that we can safely equate "word" and | |
"64 bits".) Can't we just have some deterministic algorithm to convert back | |
and forth? The answer is, of course, "yes"; the real question is not the | |
tractability, but how to do this without driving the users of the language | |
crazy---bearing in mind, of course, that this is intended as a somewhat | |
low-level language for bootstrapping the system, and while it needs to be | |
readable and writable, it does not need to be anyone's favorite language, | |
unless a minimalism nerd, who won't mind a little bit of inconvenience anyhow. | |
It is desirable for the algorithm to be straightforward since we are going for | |
minimalism and speed. It should not be excessively difficult to implement, and | |
it should run fast, since symbols are going to be bread and butter. The | |
obvious thing to do is some human-friendly base-X encoding. Human-friendly, | |
not in the sense that the programmer can easily convert to the decimal form in | |
his mind (for that should not be important), but in that it provides you | |
enough characters to form relatively pleasant identifiers. A base-X encoding, | |
because that will be lossless. But what base should be used? | |
The primary factors when choosing a base are: | |
1. The number of characters available to the programmer in identifiers. | |
2. The maximum length of identifiers that will fit into 64 bits. | |
However, there is a further kink in that it would be nice to have the | |
syntactic convenience available in most languages, that identifiers are | |
distinguishable from numbers by the first character being non-numeric, despite | |
numeric symbols being allowed. | |
1 2 3 4 5 6 7 8 9 10 11 12 13 | |
2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁴ | |
5 10 15 20 25 30 35 40 45 50 55 60 64 | |
Little-endian. Twelve 32-character digits; the thirteenth is 16 characters. | |
An underscore can be used as syntactic sugar to skip specifying the rest of | |
the base-32 digits, i.e. populate them with a's (zeros), to allow an easy way | |
to name identifiers with numbers at the end, e.g. `x_0` and `x_1`, which will | |
be equivalent to `xaaaaaaaaaaa0` and `xaaaaaaaaaaa1`, respectively. | |
*/ | |
#define BASE 32 | |
#define MAX_SYMBOL_LENGTH 13 | |
#define DIGIT_13_BASE 16 | |
const unsigned long compressChar(const char symChar) { | |
switch (symChar) { | |
case 'a': return 0; | |
case 'b': return 1; | |
case 'c': return 2; | |
case 'd': return 3; | |
case 'e': return 4; | |
case 'f': return 5; | |
case 'g': return 6; | |
case 'h': return 7; | |
case 'i': return 8; | |
case 'j': return 9; | |
case 'k': return 10; | |
case 'l': return 11; | |
case 'm': return 12; | |
case 'n': return 13; | |
case 'o': return 14; | |
case 'p': return 15; | |
case 'q': return 16; | |
case 'r': return 17; | |
case 's': return 18; | |
case 't': return 19; | |
case 'u': return 20; | |
case 'v': return 21; | |
case 'w': return 22; | |
case 'x': return 23; | |
case 'y': return 24; | |
case 'z': return 25; | |
case '-': return 26; | |
case '+': return 27; | |
case '/': return 28; | |
case '*': return 29; | |
case '?': return 30; | |
case '!': return 31; | |
default: return BASE; | |
} | |
} | |
const unsigned long compress13thChar(const char symChar) { | |
switch (symChar) { | |
case '0': return 0; | |
case '1': return 1; | |
case '2': return 2; | |
case '3': return 3; | |
case '4': return 4; | |
case '5': return 5; | |
case '6': return 6; | |
case '7': return 7; | |
case '8': return 8; | |
case '9': return 9; | |
case '-': return 10; | |
case '+': return 11; | |
case '/': return 12; | |
case '*': return 13; | |
case '?': return 14; | |
case '!': return 15; | |
default: return DIGIT_13_BASE; | |
} | |
} | |
const MaybeULong compressSymbol(const char* const symbol) { | |
unsigned long acc = 0; // `acc` is our accumulator | |
unsigned long c = 0; // `c` is our loop counter | |
unsigned long i = 1; // `i` is our digit power counter | |
unsigned long base = BASE; | |
// Compression function is different for the 13th digit | |
const unsigned long (*compress)(const char) = compressChar; | |
// Iterate through the c-string. | |
while (*(c+symbol)) { | |
if (compress == compress13thChar) { | |
return nothingULong; // Prevent overflows | |
} else if (c == MAX_SYMBOL_LENGTH - 1) { | |
compress = compress13thChar; | |
} | |
if (compress != compress13thChar && *(c+symbol) == '_') { | |
for (unsigned long j = c; j < MAX_SYMBOL_LENGTH - 1; j++) { | |
// Empower the digit counter | |
i *= base; | |
} | |
// Increment the loop counter | |
c++; | |
compress = compress13thChar; | |
base = DIGIT_13_BASE; | |
} | |
// Convert a single symbol character to its numeric value | |
unsigned long x = compress(*(c+symbol)); | |
// The character is not a valid symbol character | |
if (x >= base) return nothingULong; | |
// Accumulate | |
acc += i * x; | |
// Increment the loop counter | |
c++; | |
// Empower the digit counter | |
i *= base; | |
if (c == MAX_SYMBOL_LENGTH - 1) { | |
base = DIGIT_13_BASE; | |
} | |
} | |
// If we had a problem we would have short-circuited before now | |
return justULong(acc); | |
} | |
const char decompressChar(const unsigned long symChar) { | |
switch (symChar) { | |
case 0: return 'a'; | |
case 1: return 'b'; | |
case 2: return 'c'; | |
case 3: return 'd'; | |
case 4: return 'e'; | |
case 5: return 'f'; | |
case 6: return 'g'; | |
case 7: return 'h'; | |
case 8: return 'i'; | |
case 9: return 'j'; | |
case 10: return 'k'; | |
case 11: return 'l'; | |
case 12: return 'm'; | |
case 13: return 'n'; | |
case 14: return 'o'; | |
case 15: return 'p'; | |
case 16: return 'q'; | |
case 17: return 'r'; | |
case 18: return 's'; | |
case 19: return 't'; | |
case 20: return 'u'; | |
case 21: return 'v'; | |
case 22: return 'w'; | |
case 23: return 'x'; | |
case 24: return 'y'; | |
case 25: return 'z'; | |
case 26: return '-'; | |
case 27: return '+'; | |
case 28: return '/'; | |
case 29: return '*'; | |
case 30: return '?'; | |
case 31: return '!'; // BASE-32 | |
default: return '\0'; | |
} | |
} | |
const char decompress13thChar(const unsigned long symChar) { | |
switch (symChar) { | |
case 0: return '0'; | |
case 1: return '1'; | |
case 2: return '2'; | |
case 3: return '3'; | |
case 4: return '4'; | |
case 5: return '5'; | |
case 6: return '6'; | |
case 7: return '7'; | |
case 8: return '8'; | |
case 9: return '9'; | |
case 10: return '-'; | |
case 11: return '+'; | |
case 12: return '/'; | |
case 13: return '*'; | |
case 14: return '?'; | |
case 15: return '!'; // BASE-16 | |
default: return '\0'; | |
} | |
} | |
char* decompressSymbol(const unsigned long x) { | |
unsigned long rem = x; | |
char* symbol = (char*)calloc(MAX_SYMBOL_LENGTH + 1, sizeof(char)); | |
unsigned char i = 0; | |
unsigned char base = BASE; | |
unsigned char zeros = 0; | |
const char (*decompress)(const unsigned long) = decompressChar; | |
do { | |
if (i == MAX_SYMBOL_LENGTH - 1) { | |
base = DIGIT_13_BASE; | |
decompress = decompress13thChar; | |
if (zeros > 1) { | |
i = i - zeros; | |
*((char*)((unsigned long)symbol+i)) = '_'; | |
i++; | |
} | |
} | |
char c = decompress(rem % base); | |
switch (c) { | |
case 'a': zeros++; break; | |
default: zeros=0; break; | |
} | |
*((char*)((unsigned long)symbol+i)) = c; | |
rem /= base; | |
i++; | |
} while (rem > 0); | |
*((char*)((unsigned long)symbol+i)) = '\0'; | |
return symbol; | |
} | |
void compressChar_is_left_inverse_of_decompressChar(void) { | |
unsigned long x = 0; | |
unsigned long failed = 0; | |
for (; x < BASE; x++) { | |
unsigned long actual = compressChar(decompressChar(x)); | |
if (actual != x) { | |
printf("%lu != %lu\n", actual, x); | |
failed++; | |
} | |
} | |
// Make sure BASE was actually the cutoff point. A full-blown property-based | |
// test is probably overkill here. | |
for (; x < BASE*2; x++) { | |
char actual = decompressChar(x); | |
if (actual != '\0') { | |
printf("%c != NUL\n", actual); | |
failed++; | |
} | |
} | |
if (failed) { | |
printf("%s: %lu/%lu failed\n", __func__, failed, x); | |
} else { | |
printf("%s: 100%% (%lu) passed\n", __func__, x); | |
} | |
} | |
void compress13thChar_is_left_inverse_of_decompress13thChar(void) { | |
unsigned long x = 0; | |
unsigned long failed = 0; | |
for (; x < DIGIT_13_BASE; x++) { | |
unsigned long actual = compress13thChar(decompress13thChar(x)); | |
if (actual != x) { | |
printf("%lu != %lu\n", actual, x); | |
} | |
} | |
// Make sure DIGIT_13_BASE was actually the cutoff point. A full-blown | |
// property-based test is probably overkill here. | |
for (; x < DIGIT_13_BASE*2; x++) { | |
char actual = decompress13thChar(x); | |
if (actual != '\0') { | |
printf("%c != NUL\n", actual); | |
} | |
} | |
if (failed) { | |
printf("%s: %lu/%lu failed\n", __func__, failed, x); | |
} else { | |
printf("%s: 100%% (%lu) passed\n", __func__, x); | |
} | |
} | |
void compressSymbol_is_left_inverse_of_decompressSymbol(void) { | |
unsigned long failed = 0; | |
unsigned long i = 0; | |
for (; i < 1000; i++) { | |
unsigned long x = rand(); | |
char* c = decompressSymbol(x); | |
MaybeULong actual = compressSymbol(c); | |
if (actual.nothing) { | |
printf("Unable to decompress (from %lu): %s\n", x, c); | |
failed++; | |
} else if (actual.x != x) { | |
printf("%lu != %lu\n", actual.x, x); | |
failed++; | |
} | |
free(c); | |
} | |
for (; i < 2000; i++) { | |
unsigned long x = rand() * rand(); | |
char* c = decompressSymbol(x); | |
MaybeULong actual = compressSymbol(c); | |
if (actual.nothing) { | |
printf("Unable to decompress (from %lu): %s\n", x, c); | |
failed++; | |
} else if (actual.x != x) { | |
printf("%lu != %lu\n", actual.x, x); | |
failed++; | |
} | |
free(c); | |
} | |
if (failed) { | |
printf("%s: %lu/%lu failed\n", __func__, failed, i); | |
} else { | |
printf("%s: 100%% (%lu) passed\n", __func__, i); | |
} | |
} | |
void check(void) { | |
compressChar_is_left_inverse_of_decompressChar(); | |
compress13thChar_is_left_inverse_of_decompress13thChar(); | |
compressSymbol_is_left_inverse_of_decompressSymbol(); | |
} | |
int main(int argc, char *argv[]) { | |
switch (argc) { | |
case 3: | |
if (strcmp(argv[1], "compress") == 0) { | |
MaybeULong m = compressSymbol(argv[2]); | |
if (m.nothing) return 1; | |
printf("%lu\n", m.x); | |
break; | |
} else if (strcmp(argv[1], "decompress") == 0) { | |
char* symbol = decompressSymbol(strtoul(argv[2], 0, 10)); | |
printf("%s\n", symbol); | |
free(symbol); | |
break; | |
} | |
case 2: | |
if (strcmp(argv[1], "check") == 0) { | |
check(); | |
break; | |
} | |
default: | |
{ | |
char* symbol = decompressSymbol(ULONG_MAX); | |
printf("%s\n", symbol); | |
free(symbol); | |
break; | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment