Skip to content

Instantly share code, notes, and snippets.

@Kazark
Created April 16, 2020 01:32
Show Gist options
  • Save Kazark/28d191b5b5c074617c7e1de882537aee to your computer and use it in GitHub Desktop.
Save Kazark/28d191b5b5c074617c7e1de882537aee to your computer and use it in GitHub Desktop.
Usable Lisp-style symbols, without a global symbol table or interning or heavyweight strings
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct MaybeULong {
const unsigned long nothing;
const unsigned long x;
} MaybeULong;
const MaybeULong justULong(const unsigned long x) {
MaybeULong j = { .nothing = 0, .x = x };
return j;
}
const MaybeULong nothingULong = { .nothing = 1, .x = 0 };
/*
I assert:
1. Having Lisp/Prolog-style symbols in your language is fantastic.
2. Having a global symbol table sucks. Giant global state. Blech.
3. Interning strings for symbols feels heavy.
So, can't we just represent symbols by words? (We'll assume that modern
hardware is moving enough toward 64-bit that we can safely equate "word" and
"64 bits".) Can't we just have some deterministic algorithm to convert back
and forth? The answer is, of course, "yes"; the real question is not the
tractability, but how to do this without driving the users of the language
crazy---bearing in mind, of course, that this is intended as a somewhat
low-level language for bootstrapping the system, and while it needs to be
readable and writable, it does not need to be anyone's favorite language,
unless a minimalism nerd, who won't mind a little bit of inconvenience anyhow.
It is desirable for the algorithm to be straightforward since we are going for
minimalism and speed. It should not be excessively difficult to implement, and
it should run fast, since symbols are going to be bread and butter. The
obvious thing to do is some human-friendly base-X encoding. Human-friendly,
not in the sense that the programmer can easily convert to the decimal form in
his mind (for that should not be important), but in that it provides you
enough characters to form relatively pleasant identifiers. A base-X encoding,
because that will be lossless. But what base should be used?
The primary factors when choosing a base are:
1. The number of characters available to the programmer in identifiers.
2. The maximum length of identifiers that will fit into 64 bits.
However, there is a further kink in that it would be nice to have the
syntactic convenience available in most languages, that identifiers are
distinguishable from numbers by the first character being non-numeric, despite
numeric symbols being allowed.
1 2 3 4 5 6 7 8 9 10 11 12 13
2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁴
5 10 15 20 25 30 35 40 45 50 55 60 64
Little-endian. Twelve 32-character digits; the thirteenth is 16 characters.
An underscore can be used as syntactic sugar to skip specifying the rest of
the base-32 digits, i.e. populate them with a's (zeros), to allow an easy way
to name identifiers with numbers at the end, e.g. `x_0` and `x_1`, which will
be equivalent to `xaaaaaaaaaaa0` and `xaaaaaaaaaaa1`, respectively.
*/
#define BASE 32
#define MAX_SYMBOL_LENGTH 13
#define DIGIT_13_BASE 16
const unsigned long compressChar(const char symChar) {
switch (symChar) {
case 'a': return 0;
case 'b': return 1;
case 'c': return 2;
case 'd': return 3;
case 'e': return 4;
case 'f': return 5;
case 'g': return 6;
case 'h': return 7;
case 'i': return 8;
case 'j': return 9;
case 'k': return 10;
case 'l': return 11;
case 'm': return 12;
case 'n': return 13;
case 'o': return 14;
case 'p': return 15;
case 'q': return 16;
case 'r': return 17;
case 's': return 18;
case 't': return 19;
case 'u': return 20;
case 'v': return 21;
case 'w': return 22;
case 'x': return 23;
case 'y': return 24;
case 'z': return 25;
case '-': return 26;
case '+': return 27;
case '/': return 28;
case '*': return 29;
case '?': return 30;
case '!': return 31;
default: return BASE;
}
}
const unsigned long compress13thChar(const char symChar) {
switch (symChar) {
case '0': return 0;
case '1': return 1;
case '2': return 2;
case '3': return 3;
case '4': return 4;
case '5': return 5;
case '6': return 6;
case '7': return 7;
case '8': return 8;
case '9': return 9;
case '-': return 10;
case '+': return 11;
case '/': return 12;
case '*': return 13;
case '?': return 14;
case '!': return 15;
default: return DIGIT_13_BASE;
}
}
const MaybeULong compressSymbol(const char* const symbol) {
unsigned long acc = 0; // `acc` is our accumulator
unsigned long c = 0; // `c` is our loop counter
unsigned long i = 1; // `i` is our digit power counter
unsigned long base = BASE;
// Compression function is different for the 13th digit
const unsigned long (*compress)(const char) = compressChar;
// Iterate through the c-string.
while (*(c+symbol)) {
if (compress == compress13thChar) {
return nothingULong; // Prevent overflows
} else if (c == MAX_SYMBOL_LENGTH - 1) {
compress = compress13thChar;
}
if (compress != compress13thChar && *(c+symbol) == '_') {
for (unsigned long j = c; j < MAX_SYMBOL_LENGTH - 1; j++) {
// Empower the digit counter
i *= base;
}
// Increment the loop counter
c++;
compress = compress13thChar;
base = DIGIT_13_BASE;
}
// Convert a single symbol character to its numeric value
unsigned long x = compress(*(c+symbol));
// The character is not a valid symbol character
if (x >= base) return nothingULong;
// Accumulate
acc += i * x;
// Increment the loop counter
c++;
// Empower the digit counter
i *= base;
if (c == MAX_SYMBOL_LENGTH - 1) {
base = DIGIT_13_BASE;
}
}
// If we had a problem we would have short-circuited before now
return justULong(acc);
}
const char decompressChar(const unsigned long symChar) {
switch (symChar) {
case 0: return 'a';
case 1: return 'b';
case 2: return 'c';
case 3: return 'd';
case 4: return 'e';
case 5: return 'f';
case 6: return 'g';
case 7: return 'h';
case 8: return 'i';
case 9: return 'j';
case 10: return 'k';
case 11: return 'l';
case 12: return 'm';
case 13: return 'n';
case 14: return 'o';
case 15: return 'p';
case 16: return 'q';
case 17: return 'r';
case 18: return 's';
case 19: return 't';
case 20: return 'u';
case 21: return 'v';
case 22: return 'w';
case 23: return 'x';
case 24: return 'y';
case 25: return 'z';
case 26: return '-';
case 27: return '+';
case 28: return '/';
case 29: return '*';
case 30: return '?';
case 31: return '!'; // BASE-32
default: return '\0';
}
}
const char decompress13thChar(const unsigned long symChar) {
switch (symChar) {
case 0: return '0';
case 1: return '1';
case 2: return '2';
case 3: return '3';
case 4: return '4';
case 5: return '5';
case 6: return '6';
case 7: return '7';
case 8: return '8';
case 9: return '9';
case 10: return '-';
case 11: return '+';
case 12: return '/';
case 13: return '*';
case 14: return '?';
case 15: return '!'; // BASE-16
default: return '\0';
}
}
char* decompressSymbol(const unsigned long x) {
unsigned long rem = x;
char* symbol = (char*)calloc(MAX_SYMBOL_LENGTH + 1, sizeof(char));
unsigned char i = 0;
unsigned char base = BASE;
unsigned char zeros = 0;
const char (*decompress)(const unsigned long) = decompressChar;
do {
if (i == MAX_SYMBOL_LENGTH - 1) {
base = DIGIT_13_BASE;
decompress = decompress13thChar;
if (zeros > 1) {
i = i - zeros;
*((char*)((unsigned long)symbol+i)) = '_';
i++;
}
}
char c = decompress(rem % base);
switch (c) {
case 'a': zeros++; break;
default: zeros=0; break;
}
*((char*)((unsigned long)symbol+i)) = c;
rem /= base;
i++;
} while (rem > 0);
*((char*)((unsigned long)symbol+i)) = '\0';
return symbol;
}
void compressChar_is_left_inverse_of_decompressChar(void) {
unsigned long x = 0;
unsigned long failed = 0;
for (; x < BASE; x++) {
unsigned long actual = compressChar(decompressChar(x));
if (actual != x) {
printf("%lu != %lu\n", actual, x);
failed++;
}
}
// Make sure BASE was actually the cutoff point. A full-blown property-based
// test is probably overkill here.
for (; x < BASE*2; x++) {
char actual = decompressChar(x);
if (actual != '\0') {
printf("%c != NUL\n", actual);
failed++;
}
}
if (failed) {
printf("%s: %lu/%lu failed\n", __func__, failed, x);
} else {
printf("%s: 100%% (%lu) passed\n", __func__, x);
}
}
void compress13thChar_is_left_inverse_of_decompress13thChar(void) {
unsigned long x = 0;
unsigned long failed = 0;
for (; x < DIGIT_13_BASE; x++) {
unsigned long actual = compress13thChar(decompress13thChar(x));
if (actual != x) {
printf("%lu != %lu\n", actual, x);
}
}
// Make sure DIGIT_13_BASE was actually the cutoff point. A full-blown
// property-based test is probably overkill here.
for (; x < DIGIT_13_BASE*2; x++) {
char actual = decompress13thChar(x);
if (actual != '\0') {
printf("%c != NUL\n", actual);
}
}
if (failed) {
printf("%s: %lu/%lu failed\n", __func__, failed, x);
} else {
printf("%s: 100%% (%lu) passed\n", __func__, x);
}
}
void compressSymbol_is_left_inverse_of_decompressSymbol(void) {
unsigned long failed = 0;
unsigned long i = 0;
for (; i < 1000; i++) {
unsigned long x = rand();
char* c = decompressSymbol(x);
MaybeULong actual = compressSymbol(c);
if (actual.nothing) {
printf("Unable to decompress (from %lu): %s\n", x, c);
failed++;
} else if (actual.x != x) {
printf("%lu != %lu\n", actual.x, x);
failed++;
}
free(c);
}
for (; i < 2000; i++) {
unsigned long x = rand() * rand();
char* c = decompressSymbol(x);
MaybeULong actual = compressSymbol(c);
if (actual.nothing) {
printf("Unable to decompress (from %lu): %s\n", x, c);
failed++;
} else if (actual.x != x) {
printf("%lu != %lu\n", actual.x, x);
failed++;
}
free(c);
}
if (failed) {
printf("%s: %lu/%lu failed\n", __func__, failed, i);
} else {
printf("%s: 100%% (%lu) passed\n", __func__, i);
}
}
void check(void) {
compressChar_is_left_inverse_of_decompressChar();
compress13thChar_is_left_inverse_of_decompress13thChar();
compressSymbol_is_left_inverse_of_decompressSymbol();
}
int main(int argc, char *argv[]) {
switch (argc) {
case 3:
if (strcmp(argv[1], "compress") == 0) {
MaybeULong m = compressSymbol(argv[2]);
if (m.nothing) return 1;
printf("%lu\n", m.x);
break;
} else if (strcmp(argv[1], "decompress") == 0) {
char* symbol = decompressSymbol(strtoul(argv[2], 0, 10));
printf("%s\n", symbol);
free(symbol);
break;
}
case 2:
if (strcmp(argv[1], "check") == 0) {
check();
break;
}
default:
{
char* symbol = decompressSymbol(ULONG_MAX);
printf("%s\n", symbol);
free(symbol);
break;
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment