DavidJCobb · February 24, 2025 07:09
diff --git a/README.md b/README.md
diff --git a/ascii85.js b/ascii85.js

 //
 // Ascii85 (base-85) codec with a configurable 85-character digit alphabet.
 //
 // Every four input bytes (one big-endian DWORD) become five digits. A DWORD 
 // of zero is abbreviated to ZERO_RUN_CHAR and a DWORD of four ASCII spaces 
 // to SPACE_RUN_CHAR. A trailing run of one to three bytes becomes two to 
 // four digits.
 //
 class Ascii85Codec {
   static CANONICAL_CHARSET = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu";

   // Replaces all canonical characters that would be escaped when stored in 
   // Chrome's localStorage.
   //
   //    " -> v
   //    < -> w
   //    \ -> x
   //
   static STORAGE_CHARSET = "!v#$%&'()*+,-./0123456789:;w=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[x]^_`abcdefghijklmnopqrstu";

   static SPACE_RUN_CHAR = 'y';
   static ZERO_RUN_CHAR  = 'z';

   //
   // charset: optional string of exactly 85 distinct characters, none of 
   //          which may be SPACE_RUN_CHAR or ZERO_RUN_CHAR. Defaults to 
   //          CANONICAL_CHARSET. Throws an Error for any other charset.
   //
   constructor(charset) {
      if (charset) {
         if (charset.length !== 85)
            throw new Error("Invalid character set.");
      } else {
         charset = Ascii85Codec.CANONICAL_CHARSET;
      }

      this.charset = charset;
      this.table   = new Map(); // digit character -> digit value, for decoding
      for(let i = 0; i < charset.length; ++i) {
         const c = charset[i];
         //
         // A repeated digit, or a digit that doubles as an abbreviation 
         // marker, would make encoded text ambiguous to decode.
         //
         if (c === Ascii85Codec.SPACE_RUN_CHAR || c === Ascii85Codec.ZERO_RUN_CHAR || this.table.has(c))
            throw new Error("Invalid character set.");
         this.table.set(c, i);
      }

      //
      // Pre-construct a regex for faster validation of encoded inputs. 
      // Every character is written as a \uXXXX escape so that no charset 
      // character can be misread as regex syntax, whatever its code unit.
      //
      const to_class = (chars) => {
         let escaped = "";
         for(let i = 0; i < chars.length; ++i)
            escaped += "\\u" + chars.charCodeAt(i).toString(16).padStart(4, '0');
         return escaped;
      };
      const digits        = to_class(charset);
      const abbreviations = to_class(Ascii85Codec.SPACE_RUN_CHAR + Ascii85Codec.ZERO_RUN_CHAR);

      //
      // Any number (including zero) of abbreviation markers or groups of 
      // five digits, optionally followed by one final group of two to four 
      // digits. A lone trailing digit cannot encode any byte, so it is not 
      // accepted.
      //
      this.validation_regex = new RegExp(`^(?:[${abbreviations}]|[${digits}]{5})*(?:[${digits}]{2,4})?$`);
   }

   //
   // Packs an unsigned 32-bit value into five digits, most significant first.
   //
   /*String*/ #pack(/*uint32*/ dword) /*const*/ {
      let digits = "";
      for(let k = 0; k < 5; ++k) {
         const unit = dword % 85;
         digits = this.charset[unit] + digits;
         dword  = (dword - unit) / 85;
      }
      return digits;
   }

   //
   // Computes the value of the digit group that starts at `start` and has 
   // `count` digits present. Missing digits are taken to be the highest 
   // digit (84), which rounds a truncated group up so that its leading 
   // bytes survive. Returns NaN if any present character is not a digit.
   //
   /*Number*/ #unpack(/*const String*/ str, start, count) /*const*/ {
      let value = 0;
      for(let k = 0; k < 5; ++k) {
         const digit = (k < count) ? this.table.get(str[start + k]) : 84;
         if (digit === undefined)
            return NaN;
         value = value * 85 + digit;
      }
      return value;
   }

   //
   // Encodes the bytes of `view` and returns the encoded string. `view` is 
   // normally a DataView; an ArrayBuffer or a typed array is also accepted 
   // and is wrapped without copying.
   //
   /*String*/ encode(/*const DataView*/ view) /*const*/ {
      const ZERO_RUN_CHAR  = Ascii85Codec.ZERO_RUN_CHAR;
      const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;

      let data = view;
      if (view instanceof ArrayBuffer) {
         data = new DataView(view);
      } else if (ArrayBuffer.isView(view) && !(view instanceof DataView)) {
         data = new DataView(view.buffer, view.byteOffset, view.byteLength);
      }

      let out = "";
      let i;
      for(i = 0; i + 3 < data.byteLength; i += 4) {
         const dword = data.getUint32(i, false);
         if (dword === 0) {
            out += ZERO_RUN_CHAR;
         } else if (dword === 0x20202020) {
            out += SPACE_RUN_CHAR;
         } else {
            out += this.#pack(dword);
         }
      }
      const rem = data.byteLength - i;
      if (rem) {
         //
         // When the source data is not a multiple of four bytes, we must 
         // pad it with zero bytes. The digits that only describe padding 
         // are dropped from the output. Multiplication is used instead of 
         // shifts because JS bitwise operators produce int32_t results.
         //
         let dword = 0;
         for(let k = 0; k < 4; ++k)
            dword = dword * 256 + (k < rem ? data.getUint8(i + k) : 0);
         out += this.#pack(dword).substring(0, rem + 1);
      }
      return out;
   }

   //
   // Returns true if decode() would accept `encoded`: the text is built 
   // only from abbreviation markers and well-formed digit groups, and no 
   // group's value exceeds 32 bits.
   //
   /*bool*/ validate(/*String*/ encoded) /*const*/ {
      if (encoded === "")
         return true; // an empty string is valid
      if (!this.validation_regex.test(encoded))
         return false;
      for(let i = 0; i < encoded.length; ) {
         const c = encoded[i];
         if (c === Ascii85Codec.ZERO_RUN_CHAR || c === Ascii85Codec.SPACE_RUN_CHAR) {
            ++i;
            continue;
         }
         const count = Math.min(5, encoded.length - i);
         if (!(this.#unpack(encoded, i, count) <= 0xFFFFFFFF))
            return false;
         i += count;
      }
      return true;
   }

   //
   // Decodes `str` and returns a DataView over the decoded bytes. Throws an 
   // Error if `str` contains a character that is not valid at its position, 
   // ends in a lone digit, or contains a group whose value exceeds 32 bits.
   //
   // NOTE: The ArrayBuffer this allocates always has a size that is a multiple 
   //       of four; but the DataView that this returns is of exact length. 
   //       This wastes at most three bytes of memory per operation.
   //
   /*DataView*/ decode(/*const String*/ str) /*const*/ {
      const ZERO_RUN_CHAR  = Ascii85Codec.ZERO_RUN_CHAR;
      const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;

      //
      // Abbreviated tokens are single characters that represent whole 
      // DWORDs; every other token is a group of five digits, except that 
      // the final group may be short.
      //
      let extras = 0;
      for(let i = 0; i < str.length; ++i) {
         const c = str[i];
         if (c === ZERO_RUN_CHAR || c === SPACE_RUN_CHAR)
            ++extras;
      }
      const digit_count = str.length - extras;
      const tail = digit_count % 5;
      const pad  = tail ? 5 - tail : 0; // digits missing from the final group
      const size = ((digit_count + pad) / 5 + extras) * 4;

      const buffer = new ArrayBuffer(size);
      const view   = new DataView(buffer);

      let j = 0;
      for(let i = 0; i < str.length; j += 4) {
         const c = str[i];
         if (c === ZERO_RUN_CHAR) {
            view.setUint32(j, 0, false);
            ++i;
            continue;
         }
         if (c === SPACE_RUN_CHAR) {
            view.setUint32(j, 0x20202020, false);
            ++i;
            continue;
         }
         //
         // Only the last group can be short, because `count` is below five 
         // only when fewer than five characters remain.
         //
         const count = Math.min(5, str.length - i);
         const dword = (count > 1) ? this.#unpack(str, i, count) : NaN;
         if (!(dword <= 0xFFFFFFFF)) // also catches NaN
            throw new Error(`Invalid Ascii85 group at offset ${i}.`);
         view.setUint32(j, dword, false);
         i += count;
      }
      //
      // Each missing digit corresponds to one padding byte, which the 
      // returned view leaves out.
      //
      return pad ? new DataView(buffer, 0, size - pad) : view;
   }
 };
diff --git a/testcases.html b/testcases.html
 <!doctype html>
 <html>
   <head>
      <title>Ascii85 testcases</title>
      <script src="ascii85.js"></script>
      <script>
 {
   // Shared codec under test; built with no argument, so it uses the class's
   // default character set.
   const codec = new Ascii85Codec();
   
   // Logs `name`, a colon, and then every byte of `view` as two uppercase
   // hex digits, each preceded by a single space.
   function print_buffer(name, view) {
      const pieces = [name + ":"];
      let offset = 0;
      while (offset < view.byteLength) {
         const hex = view.getUint8(offset).toString(16).toUpperCase();
         pieces.push(hex.padStart(2, '0'));
         offset += 1;
      }
      console.log(pieces.join(' '));
   }
   
   // Round-trips `bytes` (an array of byte values) through codec.encode and
   // codec.decode, logging every stage inside a console group. Throws if the
   // decoded length or content differs from the input, or if the encoded
   // text fails codec.validate. The console group is always closed.
   function test(bytes) {
      console.group();
      try {
         const source = new DataView(new ArrayBuffer(bytes.length));
         let at = 0;
         while (at < bytes.length) {
            source.setUint8(at, bytes[at]);
            ++at;
         }

         const encoded = codec.encode(source);
         const decoded = codec.decode(encoded);

         print_buffer("test input", source);
         console.log(`encoded (${encoded.length}): ${encoded}`);
         print_buffer("round-tripped", decoded);
         // The decoded view may be shorter than its backing buffer; show both.
         print_buffer("round-tripped (including scratch)", new DataView(decoded.buffer));

         if (decoded.byteLength != source.byteLength)
            throw new Error("round-trip length changed");
         for (let k = 0; k < bytes.length; ++k) {
            const mismatch = decoded.getUint8(k) != bytes[k];
            if (mismatch)
               throw new Error("round-trip content changed");
         }
         const accepted = codec.validate(encoded);
         if (!accepted)
            throw new Error("encoded text did not validate");
      } finally {
         console.groupEnd();
      }
   }
   
   // Round-trip fixtures, run in order; the first failure aborts the rest.
   const round_trip_cases = [
      // Lengths that are a whole number of DWORDs.
      [0x11, 0x22, 0x33, 0x44],
      [0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88],

      // Lengths that leave one, two, or three trailing bytes.
      [0x11, 0x22, 0x33, 0x44, 0x55],
      [0x11, 0x22, 0x33, 0x44, 0x55, 0x66],
      [0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77],

      // A middle DWORD that is partly zero, all zero, or all spaces.
      [0x11, 0x22, 0x33, 0x44, 0x00, 0x66, 0x77, 0x00, 0x99, 0xAA, 0xBB, 0xCC],
      [0x11, 0x22, 0x33, 0x44, 0x00, 0x00, 0x00, 0x00, 0x99, 0xAA, 0xBB, 0xCC],
      [0x11, 0x22, 0x33, 0x44, 0x20, 0x20, 0x20, 0x20, 0x99, 0xAA, 0xBB, 0xCC],
   ];
   for (const bytes of round_trip_cases)
      test(bytes);
   
   // Checks that codec.validate(str) agrees with `desired`. Logs the outcome
   // on agreement; throws an Error describing the mismatch otherwise.
   function test_validation(str, desired) {
      const verdict = codec.validate(str);
      if (verdict == desired) {
         const outcome = desired ? "validated" : "was rejected";
         console.log(`${str} ${outcome}`);
         return;
      }
      let error;
      if (desired) {
         error = "didn't validate when it should have";
      } else {
         error = "validated when it shouldn't have";
      }
      throw new Error(`string "${str}" ${error}`);
   }
   
   // Validation fixtures as [encoded text, expected verdict], run in order.
   const validation_cases = [
      ["!!!!!",  true],
      ["z!!!!!", true],
      ["!!!!!z", true],
      ["!!z!!",  false], // abbreviation marker inside a digit group
      ["z!!!!",  true],
   ];
   for (const [text, expected] of validation_cases)
      test_validation(text, expected);
 }
      </script>
   </head>
 </html>

	//
	// Ascii85 (base-85) codec with a configurable 85-character digit alphabet.
	//
	// Every four input bytes (one big-endian DWORD) become five digits. A DWORD
	// of zero is abbreviated to ZERO_RUN_CHAR and a DWORD of four ASCII spaces
	// to SPACE_RUN_CHAR. A trailing run of one to three bytes becomes two to
	// four digits.
	//
	class Ascii85Codec {
	   static CANONICAL_CHARSET = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu";

	   // Replaces all canonical characters that would be escaped when stored in
	   // Chrome's localStorage.
	   //
	   //    " -> v
	   //    < -> w
	   //    \ -> x
	   //
	   static STORAGE_CHARSET = "!v#$%&'()*+,-./0123456789:;w=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[x]^_`abcdefghijklmnopqrstu";

	   static SPACE_RUN_CHAR = 'y';
	   static ZERO_RUN_CHAR  = 'z';

	   //
	   // charset: optional string of exactly 85 distinct characters, none of
	   //          which may be SPACE_RUN_CHAR or ZERO_RUN_CHAR. Defaults to
	   //          CANONICAL_CHARSET. Throws an Error for any other charset.
	   //
	   constructor(charset) {
	      if (charset) {
	         if (charset.length !== 85)
	            throw new Error("Invalid character set.");
	      } else {
	         charset = Ascii85Codec.CANONICAL_CHARSET;
	      }

	      this.charset = charset;
	      this.table   = new Map(); // digit character -> digit value, for decoding
	      for(let i = 0; i < charset.length; ++i) {
	         const c = charset[i];
	         //
	         // A repeated digit, or a digit that doubles as an abbreviation
	         // marker, would make encoded text ambiguous to decode.
	         //
	         if (c === Ascii85Codec.SPACE_RUN_CHAR || c === Ascii85Codec.ZERO_RUN_CHAR || this.table.has(c))
	            throw new Error("Invalid character set.");
	         this.table.set(c, i);
	      }

	      //
	      // Pre-construct a regex for faster validation of encoded inputs.
	      // Every character is written as a \uXXXX escape so that no charset
	      // character can be misread as regex syntax, whatever its code unit.
	      //
	      const to_class = (chars) => {
	         let escaped = "";
	         for(let i = 0; i < chars.length; ++i)
	            escaped += "\\u" + chars.charCodeAt(i).toString(16).padStart(4, '0');
	         return escaped;
	      };
	      const digits        = to_class(charset);
	      const abbreviations = to_class(Ascii85Codec.SPACE_RUN_CHAR + Ascii85Codec.ZERO_RUN_CHAR);

	      //
	      // Any number (including zero) of abbreviation markers or groups of
	      // five digits, optionally followed by one final group of two to four
	      // digits. A lone trailing digit cannot encode any byte, so it is not
	      // accepted.
	      //
	      this.validation_regex = new RegExp(`^(?:[${abbreviations}]|[${digits}]{5})*(?:[${digits}]{2,4})?$`);
	   }

	   //
	   // Packs an unsigned 32-bit value into five digits, most significant first.
	   //
	   /*String*/ #pack(/*uint32*/ dword) /*const*/ {
	      let digits = "";
	      for(let k = 0; k < 5; ++k) {
	         const unit = dword % 85;
	         digits = this.charset[unit] + digits;
	         dword  = (dword - unit) / 85;
	      }
	      return digits;
	   }

	   //
	   // Computes the value of the digit group that starts at `start` and has
	   // `count` digits present. Missing digits are taken to be the highest
	   // digit (84), which rounds a truncated group up so that its leading
	   // bytes survive. Returns NaN if any present character is not a digit.
	   //
	   /*Number*/ #unpack(/*const String*/ str, start, count) /*const*/ {
	      let value = 0;
	      for(let k = 0; k < 5; ++k) {
	         const digit = (k < count) ? this.table.get(str[start + k]) : 84;
	         if (digit === undefined)
	            return NaN;
	         value = value * 85 + digit;
	      }
	      return value;
	   }

	   //
	   // Encodes the bytes of `view` and returns the encoded string. `view` is
	   // normally a DataView; an ArrayBuffer or a typed array is also accepted
	   // and is wrapped without copying.
	   //
	   /*String*/ encode(/*const DataView*/ view) /*const*/ {
	      const ZERO_RUN_CHAR  = Ascii85Codec.ZERO_RUN_CHAR;
	      const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;

	      let data = view;
	      if (view instanceof ArrayBuffer) {
	         data = new DataView(view);
	      } else if (ArrayBuffer.isView(view) && !(view instanceof DataView)) {
	         data = new DataView(view.buffer, view.byteOffset, view.byteLength);
	      }

	      let out = "";
	      let i;
	      for(i = 0; i + 3 < data.byteLength; i += 4) {
	         const dword = data.getUint32(i, false);
	         if (dword === 0) {
	            out += ZERO_RUN_CHAR;
	         } else if (dword === 0x20202020) {
	            out += SPACE_RUN_CHAR;
	         } else {
	            out += this.#pack(dword);
	         }
	      }
	      const rem = data.byteLength - i;
	      if (rem) {
	         //
	         // When the source data is not a multiple of four bytes, we must
	         // pad it with zero bytes. The digits that only describe padding
	         // are dropped from the output. Multiplication is used instead of
	         // shifts because JS bitwise operators produce int32_t results.
	         //
	         let dword = 0;
	         for(let k = 0; k < 4; ++k)
	            dword = dword * 256 + (k < rem ? data.getUint8(i + k) : 0);
	         out += this.#pack(dword).substring(0, rem + 1);
	      }
	      return out;
	   }

	   //
	   // Returns true if decode() would accept `encoded`: the text is built
	   // only from abbreviation markers and well-formed digit groups, and no
	   // group's value exceeds 32 bits.
	   //
	   /*bool*/ validate(/*String*/ encoded) /*const*/ {
	      if (encoded === "")
	         return true; // an empty string is valid
	      if (!this.validation_regex.test(encoded))
	         return false;
	      for(let i = 0; i < encoded.length; ) {
	         const c = encoded[i];
	         if (c === Ascii85Codec.ZERO_RUN_CHAR || c === Ascii85Codec.SPACE_RUN_CHAR) {
	            ++i;
	            continue;
	         }
	         const count = Math.min(5, encoded.length - i);
	         if (!(this.#unpack(encoded, i, count) <= 0xFFFFFFFF))
	            return false;
	         i += count;
	      }
	      return true;
	   }

	   //
	   // Decodes `str` and returns a DataView over the decoded bytes. Throws an
	   // Error if `str` contains a character that is not valid at its position,
	   // ends in a lone digit, or contains a group whose value exceeds 32 bits.
	   //
	   // NOTE: The ArrayBuffer this allocates always has a size that is a multiple
	   //       of four; but the DataView that this returns is of exact length.
	   //       This wastes at most three bytes of memory per operation.
	   //
	   /*DataView*/ decode(/*const String*/ str) /*const*/ {
	      const ZERO_RUN_CHAR  = Ascii85Codec.ZERO_RUN_CHAR;
	      const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;

	      //
	      // Abbreviated tokens are single characters that represent whole
	      // DWORDs; every other token is a group of five digits, except that
	      // the final group may be short.
	      //
	      let extras = 0;
	      for(let i = 0; i < str.length; ++i) {
	         const c = str[i];
	         if (c === ZERO_RUN_CHAR || c === SPACE_RUN_CHAR)
	            ++extras;
	      }
	      const digit_count = str.length - extras;
	      const tail = digit_count % 5;
	      const pad  = tail ? 5 - tail : 0; // digits missing from the final group
	      const size = ((digit_count + pad) / 5 + extras) * 4;

	      const buffer = new ArrayBuffer(size);
	      const view   = new DataView(buffer);

	      let j = 0;
	      for(let i = 0; i < str.length; j += 4) {
	         const c = str[i];
	         if (c === ZERO_RUN_CHAR) {
	            view.setUint32(j, 0, false);
	            ++i;
	            continue;
	         }
	         if (c === SPACE_RUN_CHAR) {
	            view.setUint32(j, 0x20202020, false);
	            ++i;
	            continue;
	         }
	         //
	         // Only the last group can be short, because `count` is below five
	         // only when fewer than five characters remain.
	         //
	         const count = Math.min(5, str.length - i);
	         const dword = (count > 1) ? this.#unpack(str, i, count) : NaN;
	         if (!(dword <= 0xFFFFFFFF)) // also catches NaN
	            throw new Error(`Invalid Ascii85 group at offset ${i}.`);
	         view.setUint32(j, dword, false);
	         i += count;
	      }
	      //
	      // Each missing digit corresponds to one padding byte, which the
	      // returned view leaves out.
	      //
	      return pad ? new DataView(buffer, 0, size - pad) : view;
	   }
	};
	<!doctype html>
	<html>
	<head>
	<title>Ascii85 testcases</title>
	<script src="ascii85.js"></script>
	<script>
	{
	// Shared codec under test; built with no argument, so it uses the class's
	// default character set.
	const codec = new Ascii85Codec();

	// Logs `name`, a colon, and then every byte of `view` as two uppercase
	// hex digits, each preceded by a single space.
	function print_buffer(name, view) {
	   const pieces = [name + ":"];
	   let offset = 0;
	   while (offset < view.byteLength) {
	      const hex = view.getUint8(offset).toString(16).toUpperCase();
	      pieces.push(hex.padStart(2, '0'));
	      offset += 1;
	   }
	   console.log(pieces.join(' '));
	}

	// Round-trips `bytes` (an array of byte values) through codec.encode and
	// codec.decode, logging every stage inside a console group. Throws if the
	// decoded length or content differs from the input, or if the encoded
	// text fails codec.validate. The console group is always closed.
	function test(bytes) {
	   console.group();
	   try {
	      const source = new DataView(new ArrayBuffer(bytes.length));
	      let at = 0;
	      while (at < bytes.length) {
	         source.setUint8(at, bytes[at]);
	         ++at;
	      }

	      const encoded = codec.encode(source);
	      const decoded = codec.decode(encoded);

	      print_buffer("test input", source);
	      console.log(`encoded (${encoded.length}): ${encoded}`);
	      print_buffer("round-tripped", decoded);
	      // The decoded view may be shorter than its backing buffer; show both.
	      print_buffer("round-tripped (including scratch)", new DataView(decoded.buffer));

	      if (decoded.byteLength != source.byteLength)
	         throw new Error("round-trip length changed");
	      for (let k = 0; k < bytes.length; ++k) {
	         const mismatch = decoded.getUint8(k) != bytes[k];
	         if (mismatch)
	            throw new Error("round-trip content changed");
	      }
	      const accepted = codec.validate(encoded);
	      if (!accepted)
	         throw new Error("encoded text did not validate");
	   } finally {
	      console.groupEnd();
	   }
	}

	// Round-trip fixtures, run in order; the first failure aborts the rest.
	const round_trip_cases = [
	   // Lengths that are a whole number of DWORDs.
	   [0x11, 0x22, 0x33, 0x44],
	   [0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88],

	   // Lengths that leave one, two, or three trailing bytes.
	   [0x11, 0x22, 0x33, 0x44, 0x55],
	   [0x11, 0x22, 0x33, 0x44, 0x55, 0x66],
	   [0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77],

	   // A middle DWORD that is partly zero, all zero, or all spaces.
	   [0x11, 0x22, 0x33, 0x44, 0x00, 0x66, 0x77, 0x00, 0x99, 0xAA, 0xBB, 0xCC],
	   [0x11, 0x22, 0x33, 0x44, 0x00, 0x00, 0x00, 0x00, 0x99, 0xAA, 0xBB, 0xCC],
	   [0x11, 0x22, 0x33, 0x44, 0x20, 0x20, 0x20, 0x20, 0x99, 0xAA, 0xBB, 0xCC],
	];
	for (const bytes of round_trip_cases)
	   test(bytes);

	// Checks that codec.validate(str) agrees with `desired`. Logs the outcome
	// on agreement; throws an Error describing the mismatch otherwise.
	function test_validation(str, desired) {
	   const verdict = codec.validate(str);
	   if (verdict == desired) {
	      const outcome = desired ? "validated" : "was rejected";
	      console.log(`${str} ${outcome}`);
	      return;
	   }
	   let error;
	   if (desired) {
	      error = "didn't validate when it should have";
	   } else {
	      error = "validated when it shouldn't have";
	   }
	   throw new Error(`string "${str}" ${error}`);
	}

	// Validation fixtures as [encoded text, expected verdict], run in order.
	const validation_cases = [
	   ["!!!!!",  true],
	   ["z!!!!!", true],
	   ["!!!!!z", true],
	   ["!!z!!",  false], // abbreviation marker inside a digit group
	   ["z!!!!",  true],
	];
	for (const [text, expected] of validation_cases)
	   test_validation(text, expected);
	}
	</script>
	</head>
	</html>