Kazark · April 16, 2020 01:32
diff --git a/main.c b/main.c
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 typedef struct MaybeULong {
  const unsigned long nothing;
  const unsigned long x;
 } MaybeULong;

 const MaybeULong justULong(const unsigned long x) {
  MaybeULong j = { .nothing = 0, .x = x };
  return j;
 }

 const MaybeULong nothingULong = { .nothing = 1, .x = 0 };

 /*
  I assert:
    1. Having Lisp/Prolog-style symbols in your language is fantastic.
    2. Having a global symbol table sucks. Giant global state. Blech.
    3. Interning strings for symbols feels heavy.

  So, can't we just represent symbols by words? (We'll assume that modern
  hardware is moving enough toward 64-bit that we can safely equate "word" and
  "64 bits".) Can't we just have some deterministic algorithm to convert back
  and forth? The answer is, of course, "yes"; the real question is not the
  tractability, but how to do this without driving the users of the language
  crazy---bearing in mind, of course, that this is intended as a somewhat
  low-level language for bootstrapping the system, and while it needs to be
  readable and writable, it does not need to be anyone's favorite language,
  unless a minimalism nerd, who won't mind a little bit of inconvenience anyhow.

  It is desirable for the algorithm to be straightforward since we are going for
  minimalism and speed. It should not be excessively difficult to implement, and
  it should run fast, since symbols are going to be bread and butter. The
  obvious thing to do is some human-friendly base-X encoding. Human-friendly,
  not in the sense that the programmer can easily convert to the decimal form in
  his mind (for that should not be important), but in that it provides you
  enough characters to form relatively pleasant identifiers. A base-X encoding,
  because that will be lossless. But what base should be used?

  The primary factors when choosing a base are:
    1. The number of characters available to the programmer in identifiers.
    2. The maximum length of identifiers that will fit into 64 bits.

  However, there is a further kink in that it would be nice to have the
  syntactic convenience available in most languages, that identifiers are
  distinguishable from numbers by the first character being non-numeric, despite
  numeric symbols being allowed.

  1    2    3    4    5    6    7    8    9    10   11   12   13
  2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁴
  5    10   15   20   25   30   35   40   45   50   55   60   64

  Little-endian. Twelve 32-character digits; the thirteenth is 16 characters.

  An underscore can be used as syntactic sugar to skip specifying the rest of
  the base-32 digits, i.e. populate them with a's (zeros), to allow an easy way
  to name identifiers with numbers at the end, e.g. `x_0` and `x_1`, which will
  be equivalent to `xaaaaaaaaaaa0` and `xaaaaaaaaaaa1`, respectively.
 */

 #define BASE 32
 #define MAX_SYMBOL_LENGTH 13
 #define DIGIT_13_BASE 16

 const unsigned long compressChar(const char symChar) {
  switch (symChar) {
  case 'a': return  0;
  case 'b': return  1;
  case 'c': return  2;
  case 'd': return  3;
  case 'e': return  4;
  case 'f': return  5;
  case 'g': return  6;
  case 'h': return  7;
  case 'i': return  8;
  case 'j': return  9;
  case 'k': return 10;
  case 'l': return 11;
  case 'm': return 12;
  case 'n': return 13;
  case 'o': return 14;
  case 'p': return 15;
  case 'q': return 16;
  case 'r': return 17;
  case 's': return 18;
  case 't': return 19;
  case 'u': return 20;
  case 'v': return 21;
  case 'w': return 22;
  case 'x': return 23;
  case 'y': return 24;
  case 'z': return 25;
  case '-': return 26;
  case '+': return 27;
  case '/': return 28;
  case '*': return 29;
  case '?': return 30;
  case '!': return 31;
  default:  return BASE;
  }
 }

 const unsigned long compress13thChar(const char symChar) {
  switch (symChar) {
  case '0': return  0;
  case '1': return  1;
  case '2': return  2;
  case '3': return  3;
  case '4': return  4;
  case '5': return  5;
  case '6': return  6;
  case '7': return  7;
  case '8': return  8;
  case '9': return  9;
  case '-': return 10;
  case '+': return 11;
  case '/': return 12;
  case '*': return 13;
  case '?': return 14;
  case '!': return 15;
  default:  return DIGIT_13_BASE;
  }
 }

 const MaybeULong compressSymbol(const char* const symbol) {
  unsigned long acc = 0; // `acc` is our accumulator
  unsigned long c = 0; // `c` is our loop counter
  unsigned long i = 1; // `i` is our digit power counter
  unsigned long base = BASE;
  // Compression function is different for the 13th digit
  const unsigned long (*compress)(const char) = compressChar;
  // Iterate through the c-string.
  while (*(c+symbol)) {
    if (compress == compress13thChar) {
      return nothingULong; // Prevent overflows
    } else if (c == MAX_SYMBOL_LENGTH - 1) {
      compress = compress13thChar;
    }
    if (compress != compress13thChar && *(c+symbol) == '_') {
      for (unsigned long j = c; j < MAX_SYMBOL_LENGTH - 1; j++) {
        // Empower the digit counter
        i *= base;
      }
      // Increment the loop counter
      c++;
      compress = compress13thChar;
      base = DIGIT_13_BASE;
    }
    // Convert a single symbol character to its numeric value
    unsigned long x = compress(*(c+symbol));
    // The character is not a valid symbol character
    if (x >= base) return nothingULong;
    // Accumulate
    acc += i * x;
    // Increment the loop counter
    c++;
    // Empower the digit counter
    i *= base;
    if (c == MAX_SYMBOL_LENGTH - 1) {
      base = DIGIT_13_BASE;
    }
  }
  // If we had a problem we would have short-circuited before now
  return justULong(acc);
 }

 const char decompressChar(const unsigned long symChar) {
  switch (symChar) {
  case  0: return 'a';
  case  1: return 'b';
  case  2: return 'c';
  case  3: return 'd';
  case  4: return 'e';
  case  5: return 'f';
  case  6: return 'g';
  case  7: return 'h';
  case  8: return 'i';
  case  9: return 'j';
  case 10: return 'k';
  case 11: return 'l';
  case 12: return 'm';
  case 13: return 'n';
  case 14: return 'o';
  case 15: return 'p';
  case 16: return 'q';
  case 17: return 'r';
  case 18: return 's';
  case 19: return 't';
  case 20: return 'u';
  case 21: return 'v';
  case 22: return 'w';
  case 23: return 'x';
  case 24: return 'y';
  case 25: return 'z';
  case 26: return '-';
  case 27: return '+';
  case 28: return '/';
  case 29: return '*';
  case 30: return '?';
  case 31: return '!'; // BASE-32
  default: return '\0';
  }
 }

 const char decompress13thChar(const unsigned long symChar) {
  switch (symChar) {
  case  0: return '0';
  case  1: return '1';
  case  2: return '2';
  case  3: return '3';
  case  4: return '4';
  case  5: return '5';
  case  6: return '6';
  case  7: return '7';
  case  8: return '8';
  case  9: return '9';
  case 10: return '-';
  case 11: return '+';
  case 12: return '/';
  case 13: return '*';
  case 14: return '?';
  case 15: return '!'; // BASE-16
  default: return '\0';
  }
 }

 char* decompressSymbol(const unsigned long x) {
  unsigned long rem = x;
  char* symbol = (char*)calloc(MAX_SYMBOL_LENGTH + 1, sizeof(char));
  unsigned char i = 0;
  unsigned char base = BASE;
  unsigned char zeros = 0;
  const char (*decompress)(const unsigned long) = decompressChar;
  do {
    if (i == MAX_SYMBOL_LENGTH - 1) {
      base = DIGIT_13_BASE;
      decompress = decompress13thChar;
      if (zeros > 1) {
        i = i - zeros;
        *((char*)((unsigned long)symbol+i)) = '_';
        i++;
      }
    }
    char c = decompress(rem % base);
    switch (c) {
    case 'a': zeros++; break;
    default: zeros=0; break;
    }
    *((char*)((unsigned long)symbol+i)) = c;
    rem /= base;
    i++;
  } while (rem > 0);
  *((char*)((unsigned long)symbol+i)) = '\0';
  return symbol;
 }

 void compressChar_is_left_inverse_of_decompressChar(void) {
  unsigned long x = 0;
  unsigned long failed = 0;
  for (; x < BASE; x++) {
    unsigned long actual = compressChar(decompressChar(x));
    if (actual != x) {
      printf("%lu != %lu\n", actual, x);
      failed++;
    }
  }
  // Make sure BASE was actually the cutoff point. A full-blown property-based
  // test is probably overkill here.
  for (; x < BASE*2; x++) {
    char actual = decompressChar(x);
    if (actual != '\0') {
      printf("%c != NUL\n", actual);
      failed++;
    }
  }
  if (failed) {
    printf("%s: %lu/%lu failed\n", __func__, failed, x);
  } else {
    printf("%s: 100%% (%lu) passed\n", __func__, x);
  }
 }

 void compress13thChar_is_left_inverse_of_decompress13thChar(void) {
  unsigned long x = 0;
  unsigned long failed = 0;
  for (; x < DIGIT_13_BASE; x++) {
    unsigned long actual = compress13thChar(decompress13thChar(x));
    if (actual != x) {
      printf("%lu != %lu\n", actual, x);
    }
  }
  // Make sure DIGIT_13_BASE was actually the cutoff point. A full-blown
  // property-based test is probably overkill here.
  for (; x < DIGIT_13_BASE*2; x++) {
    char actual = decompress13thChar(x);
    if (actual != '\0') {
      printf("%c != NUL\n", actual);
    }
  }
  if (failed) {
    printf("%s: %lu/%lu failed\n", __func__, failed, x);
  } else {
    printf("%s: 100%% (%lu) passed\n", __func__, x);
  }
 }

 void compressSymbol_is_left_inverse_of_decompressSymbol(void) {
  unsigned long failed = 0;
  unsigned long i = 0;
  for (; i < 1000; i++) {
    unsigned long x = rand();
    char* c = decompressSymbol(x);
    MaybeULong actual = compressSymbol(c);
    if (actual.nothing) {
      printf("Unable to decompress (from %lu): %s\n", x, c);
      failed++;
    } else if (actual.x != x) {
      printf("%lu != %lu\n", actual.x, x);
      failed++;
    }
    free(c);
  }
  for (; i < 2000; i++) {
    unsigned long x = rand() * rand();
    char* c = decompressSymbol(x);
    MaybeULong actual = compressSymbol(c);
    if (actual.nothing) {
      printf("Unable to decompress (from %lu): %s\n", x, c);
      failed++;
    } else if (actual.x != x) {
      printf("%lu != %lu\n", actual.x, x);
      failed++;
    }
    free(c);
  }
  if (failed) {
    printf("%s: %lu/%lu failed\n", __func__, failed, i);
  } else {
    printf("%s: 100%% (%lu) passed\n", __func__, i);
  }
 }

 void check(void) {
  compressChar_is_left_inverse_of_decompressChar();
  compress13thChar_is_left_inverse_of_decompress13thChar();
  compressSymbol_is_left_inverse_of_decompressSymbol();
 }

 int main(int argc, char *argv[]) {
  switch (argc) {
  case 3:
    if (strcmp(argv[1], "compress") == 0) {
      MaybeULong m = compressSymbol(argv[2]);
      if (m.nothing) return 1;
      printf("%lu\n", m.x);
      break;
    } else if (strcmp(argv[1], "decompress") == 0) {
      char* symbol = decompressSymbol(strtoul(argv[2], 0, 10));
      printf("%s\n", symbol);
      free(symbol);
      break;
    }
  case 2:
    if (strcmp(argv[1], "check") == 0) {
      check();
      break;
    }
  default:
    {
      char* symbol = decompressSymbol(ULONG_MAX);
      printf("%s\n", symbol);
      free(symbol);
      break;
    }
  }
  return 0;
 }
	#include <limits.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	typedef struct MaybeULong {
	const unsigned long nothing;
	const unsigned long x;
	} MaybeULong;

	const MaybeULong justULong(const unsigned long x) {
	MaybeULong j = { .nothing = 0, .x = x };
	return j;
	}

	const MaybeULong nothingULong = { .nothing = 1, .x = 0 };

	/*
	I assert:
	1. Having Lisp/Prolog-style symbols in your language is fantastic.
	2. Having a global symbol table sucks. Giant global state. Blech.
	3. Interning strings for symbols feels heavy.

	So, can't we just represent symbols by words? (We'll assume that modern
	hardware is moving enough toward 64-bit that we can safely equate "word" and
	"64 bits".) Can't we just have some deterministic algorithm to convert back
	and forth? The answer is, of course, "yes"; the real question is not the
	tractability, but how to do this without driving the users of the language
	crazy---bearing in mind, of course, that this is intended as a somewhat
	low-level language for bootstrapping the system, and while it needs to be
	readable and writable, it does not need to be anyone's favorite language,
	unless a minimalism nerd, who won't mind a little bit of inconvenience anyhow.

	It is desirable for the algorithm to be straightforward since we are going for
	minimalism and speed. It should not be excessively difficult to implement, and
	it should run fast, since symbols are going to be bread and butter. The
	obvious thing to do is some human-friendly base-X encoding. Human-friendly,
	not in the sense that the programmer can easily convert to the decimal form in
	his mind (for that should not be important), but in that it provides you
	enough characters to form relatively pleasant identifiers. A base-X encoding,
	because that will be lossless. But what base should be used?

	The primary factors when choosing a base are:
	1. The number of characters available to the programmer in identifiers.
	2. The maximum length of identifiers that will fit into 64 bits.

	However, there is a further kink in that it would be nice to have the
	syntactic convenience available in most languages, that identifiers are
	distinguishable from numbers by the first character being non-numeric, despite
	numeric symbols being allowed.

	1 2 3 4 5 6 7 8 9 10 11 12 13
	2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁵ × 2⁴
	5 10 15 20 25 30 35 40 45 50 55 60 64

	Little-endian. Twelve 32-character digits; the thirteenth is 16 characters.

	An underscore can be used as syntactic sugar to skip specifying the rest of
	the base-32 digits, i.e. populate them with a's (zeros), to allow an easy way
	to name identifiers with numbers at the end, e.g. `x_0` and `x_1`, which will
	be equivalent to `xaaaaaaaaaaa0` and `xaaaaaaaaaaa1`, respectively.
	*/

	#define BASE 32
	#define MAX_SYMBOL_LENGTH 13
	#define DIGIT_13_BASE 16

	const unsigned long compressChar(const char symChar) {
	switch (symChar) {
	case 'a': return 0;
	case 'b': return 1;
	case 'c': return 2;
	case 'd': return 3;
	case 'e': return 4;
	case 'f': return 5;
	case 'g': return 6;
	case 'h': return 7;
	case 'i': return 8;
	case 'j': return 9;
	case 'k': return 10;
	case 'l': return 11;
	case 'm': return 12;
	case 'n': return 13;
	case 'o': return 14;
	case 'p': return 15;
	case 'q': return 16;
	case 'r': return 17;
	case 's': return 18;
	case 't': return 19;
	case 'u': return 20;
	case 'v': return 21;
	case 'w': return 22;
	case 'x': return 23;
	case 'y': return 24;
	case 'z': return 25;
	case '-': return 26;
	case '+': return 27;
	case '/': return 28;
	case '*': return 29;
	case '?': return 30;
	case '!': return 31;
	default: return BASE;
	}
	}

	const unsigned long compress13thChar(const char symChar) {
	switch (symChar) {
	case '0': return 0;
	case '1': return 1;
	case '2': return 2;
	case '3': return 3;
	case '4': return 4;
	case '5': return 5;
	case '6': return 6;
	case '7': return 7;
	case '8': return 8;
	case '9': return 9;
	case '-': return 10;
	case '+': return 11;
	case '/': return 12;
	case '*': return 13;
	case '?': return 14;
	case '!': return 15;
	default: return DIGIT_13_BASE;
	}
	}

	const MaybeULong compressSymbol(const char* const symbol) {
	unsigned long acc = 0; // `acc` is our accumulator
	unsigned long c = 0; // `c` is our loop counter
	unsigned long i = 1; // `i` is our digit power counter
	unsigned long base = BASE;
	// Compression function is different for the 13th digit
	const unsigned long (*compress)(const char) = compressChar;
	// Iterate through the c-string.
	while (*(c+symbol)) {
	if (compress == compress13thChar) {
	return nothingULong; // Prevent overflows
	} else if (c == MAX_SYMBOL_LENGTH - 1) {
	compress = compress13thChar;
	}
	if (compress != compress13thChar && *(c+symbol) == '_') {
	for (unsigned long j = c; j < MAX_SYMBOL_LENGTH - 1; j++) {
	// Empower the digit counter
	i *= base;
	}
	// Increment the loop counter
	c++;
	compress = compress13thChar;
	base = DIGIT_13_BASE;
	}
	// Convert a single symbol character to its numeric value
	unsigned long x = compress(*(c+symbol));
	// The character is not a valid symbol character
	if (x >= base) return nothingULong;
	// Accumulate
	acc += i * x;
	// Increment the loop counter
	c++;
	// Empower the digit counter
	i *= base;
	if (c == MAX_SYMBOL_LENGTH - 1) {
	base = DIGIT_13_BASE;
	}
	}
	// If we had a problem we would have short-circuited before now
	return justULong(acc);
	}

	const char decompressChar(const unsigned long symChar) {
	switch (symChar) {
	case 0: return 'a';
	case 1: return 'b';
	case 2: return 'c';
	case 3: return 'd';
	case 4: return 'e';
	case 5: return 'f';
	case 6: return 'g';
	case 7: return 'h';
	case 8: return 'i';
	case 9: return 'j';
	case 10: return 'k';
	case 11: return 'l';
	case 12: return 'm';
	case 13: return 'n';
	case 14: return 'o';
	case 15: return 'p';
	case 16: return 'q';
	case 17: return 'r';
	case 18: return 's';
	case 19: return 't';
	case 20: return 'u';
	case 21: return 'v';
	case 22: return 'w';
	case 23: return 'x';
	case 24: return 'y';
	case 25: return 'z';
	case 26: return '-';
	case 27: return '+';
	case 28: return '/';
	case 29: return '*';
	case 30: return '?';
	case 31: return '!'; // BASE-32
	default: return '\0';
	}
	}

	const char decompress13thChar(const unsigned long symChar) {
	switch (symChar) {
	case 0: return '0';
	case 1: return '1';
	case 2: return '2';
	case 3: return '3';
	case 4: return '4';
	case 5: return '5';
	case 6: return '6';
	case 7: return '7';
	case 8: return '8';
	case 9: return '9';
	case 10: return '-';
	case 11: return '+';
	case 12: return '/';
	case 13: return '*';
	case 14: return '?';
	case 15: return '!'; // BASE-16
	default: return '\0';
	}
	}

	char* decompressSymbol(const unsigned long x) {
	unsigned long rem = x;
	char* symbol = (char*)calloc(MAX_SYMBOL_LENGTH + 1, sizeof(char));
	unsigned char i = 0;
	unsigned char base = BASE;
	unsigned char zeros = 0;
	const char (*decompress)(const unsigned long) = decompressChar;
	do {
	if (i == MAX_SYMBOL_LENGTH - 1) {
	base = DIGIT_13_BASE;
	decompress = decompress13thChar;
	if (zeros > 1) {
	i = i - zeros;
	((char)((unsigned long)symbol+i)) = '_';
	i++;
	}
	}
	char c = decompress(rem % base);
	switch (c) {
	case 'a': zeros++; break;
	default: zeros=0; break;
	}
	((char)((unsigned long)symbol+i)) = c;
	rem /= base;
	i++;
	} while (rem > 0);
	((char)((unsigned long)symbol+i)) = '\0';
	return symbol;
	}

	void compressChar_is_left_inverse_of_decompressChar(void) {
	unsigned long x = 0;
	unsigned long failed = 0;
	for (; x < BASE; x++) {
	unsigned long actual = compressChar(decompressChar(x));
	if (actual != x) {
	printf("%lu != %lu\n", actual, x);
	failed++;
	}
	}
	// Make sure BASE was actually the cutoff point. A full-blown property-based
	// test is probably overkill here.
	for (; x < BASE*2; x++) {
	char actual = decompressChar(x);
	if (actual != '\0') {
	printf("%c != NUL\n", actual);
	failed++;
	}
	}
	if (failed) {
	printf("%s: %lu/%lu failed\n", __func__, failed, x);
	} else {
	printf("%s: 100%% (%lu) passed\n", __func__, x);
	}
	}

	void compress13thChar_is_left_inverse_of_decompress13thChar(void) {
	unsigned long x = 0;
	unsigned long failed = 0;
	for (; x < DIGIT_13_BASE; x++) {
	unsigned long actual = compress13thChar(decompress13thChar(x));
	if (actual != x) {
	printf("%lu != %lu\n", actual, x);
	}
	}
	// Make sure DIGIT_13_BASE was actually the cutoff point. A full-blown
	// property-based test is probably overkill here.
	for (; x < DIGIT_13_BASE*2; x++) {
	char actual = decompress13thChar(x);
	if (actual != '\0') {
	printf("%c != NUL\n", actual);
	}
	}
	if (failed) {
	printf("%s: %lu/%lu failed\n", __func__, failed, x);
	} else {
	printf("%s: 100%% (%lu) passed\n", __func__, x);
	}
	}

	void compressSymbol_is_left_inverse_of_decompressSymbol(void) {
	unsigned long failed = 0;
	unsigned long i = 0;
	for (; i < 1000; i++) {
	unsigned long x = rand();
	char* c = decompressSymbol(x);
	MaybeULong actual = compressSymbol(c);
	if (actual.nothing) {
	printf("Unable to decompress (from %lu): %s\n", x, c);
	failed++;
	} else if (actual.x != x) {
	printf("%lu != %lu\n", actual.x, x);
	failed++;
	}
	free(c);
	}
	for (; i < 2000; i++) {
	unsigned long x = rand() * rand();
	char* c = decompressSymbol(x);
	MaybeULong actual = compressSymbol(c);
	if (actual.nothing) {
	printf("Unable to decompress (from %lu): %s\n", x, c);
	failed++;
	} else if (actual.x != x) {
	printf("%lu != %lu\n", actual.x, x);
	failed++;
	}
	free(c);
	}
	if (failed) {
	printf("%s: %lu/%lu failed\n", __func__, failed, i);
	} else {
	printf("%s: 100%% (%lu) passed\n", __func__, i);
	}
	}

	void check(void) {
	compressChar_is_left_inverse_of_decompressChar();
	compress13thChar_is_left_inverse_of_decompress13thChar();
	compressSymbol_is_left_inverse_of_decompressSymbol();
	}

	int main(int argc, char *argv[]) {
	switch (argc) {
	case 3:
	if (strcmp(argv[1], "compress") == 0) {
	MaybeULong m = compressSymbol(argv[2]);
	if (m.nothing) return 1;
	printf("%lu\n", m.x);
	break;
	} else if (strcmp(argv[1], "decompress") == 0) {
	char* symbol = decompressSymbol(strtoul(argv[2], 0, 10));
	printf("%s\n", symbol);
	free(symbol);
	break;
	}
	case 2:
	if (strcmp(argv[1], "check") == 0) {
	check();
	break;
	}
	default:
	{
	char* symbol = decompressSymbol(ULONG_MAX);
	printf("%s\n", symbol);
	free(symbol);
	break;
	}
	}
	return 0;
	}
No results found