Skip to content

Instantly share code, notes, and snippets.

@DavidJCobb
Created February 24, 2025 07:09
Show Gist options
  • Save DavidJCobb/45022f672cc987d50b8dfa0870e7b25c to your computer and use it in GitHub Desktop.
Save DavidJCobb/45022f672cc987d50b8dfa0870e7b25c to your computer and use it in GitHub Desktop.
Ascii85 (base-85) codec in JavaScript

Ascii85, also known as base85, is an encoding similar in concept to base64. Where base64 uses four ASCII characters to represent three source bytes (thereby inflating the data size by 33%), ascii85 uses five ASCII characters to represent four source bytes (thereby inflating the data size by 25%).

This script can be used to encode and decode a DataView as ascii85. Spawn an Ascii85Codec with the desired configuration, and then call its member functions as appropriate. In particular, this class should be appropriate for storing a DataView inside of Local Storage with the smallest possible size (if you use the STORAGE_CHARSET offered).

Note that we do not perform any error-checking during the decode step. Ascii85Codec offers a validate member function that can be run on an encoded string to verify that it contains no illegal characters; this can be run prior to decoding in any situation where the input data is untrusted.

Implementation notes

  • The STORAGE_CHARSET is offered as a convenience, to aid with JavaScript that needs to store binary data via the Local Storage API with minimal overhead. Browsers store all such data as a JSON string, so the string representation of the data is what counts toward storage size limits. Chromium uses a null-terminated JSON string (and counts the terminating null), and within stored string values, Chromium escapes double-quotes and and left angle brackets. STORAGE_CHARSET substitutes ", < and \ out in order to avoid the additional overhead of string escape sequences in the serialized JSON (as these would be represented in storage as \", \u003C, and \\).

  • In the encoding step, we pre-create chars as an array of five string values and then replace individual elements. This ensures that the array is packed and initially allocated with the desired size.

  • Validation of an encoded string works by pre-compiling a regular expression to test the input. We here assume that native code will do the job faster than we would. To keep as much complexity out of the regex as possible, empty strings are treated as a special case and not handled by the regex.

  • Decoding always allocates an ArrayBuffer with a length that is a multiple of 8, and we just return a truncated DataView into that buffer. This allows us to write the final chunk with a single setUint32 call, instead of having to rebuild a padded DWORD and then decompose it into bytes by hand. I am here (micro)optimizing for speed over space, wasting no more than three bytes of memory per operation.

  • Decoding uses String.prototype.indexOf to count the number of occurrences of the two abbreviated token types (z for 0x00000000 and y for 0x20202020). This requires us to crawl the input string twice, but should still be faster than manually looping over the characters just once, as we can take advantage of optimized native code (which I presume will use things like SIMD, possibly within standard library functions like memchr, to very rapidly scan the string for a single character).

class Ascii85Codec {
static CANONICAL_CHARSET = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu";
// Replaces all canonical characters that would be escaped when stored in
// Chrome's localStorage.
//
// " -> v
// < -> w
// \ -> x
//
static STORAGE_CHARSET = "!v#$%&'()*+,-./0123456789:;w=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[x]^_`abcdefghijklmnopqrstu";
static SPACE_RUN_CHAR = 'y';
static ZERO_RUN_CHAR = 'z';
constructor(charset) {
if (charset) {
if (charset.length != 85)
throw new Error("Invalid character set.");
} else {
charset = Ascii85Codec.CANONICAL_CHARSET;
}
this.charset = charset;
this.table = new Map(); // cache for faster decoding
for(let i = 0; i < charset.length; ++i) {
this.table.set(charset[i], i);
}
//
// Pre-construct a regex for faster validation of encoded inputs.
//
{
const CODE_a = ("a").charCodeAt(0);
const CODE_z = CODE_a + 25;
const CODE_A = ("A").charCodeAt(0);
const CODE_Z = CODE_A + 25;
const CODE_0 = ("0").charCodeAt(0);
const CODE_9 = CODE_0 + 9;
let character_class = "";
for(let i = 0; i < charset.length; ++i) {
let cc = charset.charCodeAt(i);
if (
(cc >= CODE_a && cc <= CODE_z) ||
(cc >= CODE_A && cc <= CODE_Z) ||
(cc >= CODE_0 && cc <= CODE_9)
) {
character_class += charset[i];
continue;
}
character_class += "\\x" + cc.toString(16).padStart(2, '0');
}
let abbreviation_class = Ascii85Codec.SPACE_RUN_CHAR + Ascii85Codec.ZERO_RUN_CHAR;
//
// First half: Allow any combination of single 'y' or 'z' markers, or
// groups of five encoding characters. Allow the string to end with
// fewer than five consecutive encoding markers.
//
let regex_src = `^(?:[${abbreviation_class}]|[${character_class}]{5})+[${character_class}]{0,4}$`;
this.validation_regex = new RegExp(regex_src);
}
}
/*String*/ encode(/*const DataView*/ view) /*const*/ {
const ZERO_RUN_CHAR = Ascii85Codec.ZERO_RUN_CHAR;
const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;
let out = "";
let i;
for(i = 0; i + 3 < view.byteLength; i += 4) {
let dword = view.getUint32(i, false);
if (dword == 0) {
out += ZERO_RUN_CHAR;
continue;
}
if (dword == 0x20202020) {
out += SPACE_RUN_CHAR;
continue;
}
let chars = [" ", " ", " ", " ", " "];
for(let j = 0; j < 5; ++j) {
let unit = dword % 85;
chars[5 - j - 1] = this.charset[unit];
dword = (dword - unit) / 85;
}
out += chars.join("");
}
let rem = view.byteLength - i;
if (rem) {
//
// When the source data is not a multiple of four bytes, we must
// pad it. We can exclude this padding from the output string,
// though.
//
let dword = 0;
for(let j = 0; j < rem; ++j) {
let byte = view.getUint8(i + j);
dword = (dword << 8) | byte;
}
dword <<= (4 - rem) * 8;
if (dword < 0) { // JS bitwise operators produce int32_t results
dword += 0xFFFFFFFF + 1;
}
//
// Pack the padded dword.
//
let chars = [" ", " ", " ", " ", " "];
for(let j = 0; j < 5; ++j) {
let unit = dword % 85;
chars[5 - j - 1] = this.charset[unit];
dword = (dword - unit) / 85;
}
out += chars.join("").substring(0, rem + 1);
}
return out;
}
/*bool*/ validate(/*String*/ encoded) /*const*/ {
if (encoded === "")
return true; // an empty string is valid
return this.validation_regex.test(encoded);
}
//
// NOTE: The ArrayBuffer this allocates always has a size that is a multiple
// of four, with zero-padding as needed; but the DataView that this
// returns should be of exact length. I am here (micro)optimizing for
// speed, wasting at most three bytes of memory per operation.
//
/*DataView*/ decode(/*const String*/ str) /*const*/ {
const ZERO_RUN_CHAR = Ascii85Codec.ZERO_RUN_CHAR;
const SPACE_RUN_CHAR = Ascii85Codec.SPACE_RUN_CHAR;
let pad; // number of characters missing
let size;
{
let extras = 0;
//
// We need to account for abbreviated tokens, i.e. single characters that
// represent whole DWORDs.
//
let i = str.indexOf(ZERO_RUN_CHAR);
while (i >= 0) {
++extras;
i = str.indexOf(ZERO_RUN_CHAR, i + 1);
}
i = str.indexOf(SPACE_RUN_CHAR);
while (i >= 0) {
++extras;
i = str.indexOf(SPACE_RUN_CHAR, i + 1);
}
//
// We also need to account for missing characters (i.e. the number of chars
// by which we must pad (or simulate the padding of) the input string). The
// encoding design is such that every sequence of five characters (besides
// abbreviated tokens) represents four bytes; ergo the string length (minus
// abbreviated tokens) will be padded to a multiple of five.
//
pad = (str.length - extras) % 5;
if (pad)
pad = 5 - pad;
size = (str.length - extras + pad) * 0.8 + (extras * 4);
}
let buffer = new ArrayBuffer(size);
let view = new DataView(buffer);
let i;
let j = 0;
for(i = 0; i + 4 < str.length; j += 4) {
let c = str[i];
if (c == ZERO_RUN_CHAR) {
view.setUint32(j, 0, false);
++i;
continue;
}
if (c == SPACE_RUN_CHAR) {
view.setUint32(j, 0x20202020, false);
++i;
continue;
}
let dword = 0;
dword += this.table.get(c) * (85**4);
dword += this.table.get(str[i + 1]) * (85**3);
dword += this.table.get(str[i + 2]) * (85**2);
dword += this.table.get(str[i + 3]) * 85;
dword += this.table.get(str[i + 4]);
i += 5;
view.setUint32(j, dword, false);
}
if (pad) {
let chunk = str.substring(str.length - (5 - pad)).padEnd(5, this.charset[this.charset.length - 1]);
let dword = 0;
dword += this.table.get(chunk[0]) * (85**4);
dword += this.table.get(chunk[1]) * (85**3);
dword += this.table.get(chunk[2]) * (85**2);
dword += this.table.get(chunk[3]) * 85;
dword += this.table.get(chunk[4]);
view.setUint32(j, dword, false);
// Resize the output dataview to ignore the padding.
view = new DataView(buffer, 0, size - pad);
}
return view;
}
};
<!doctype html>
<html>
<head>
<title>Ascii85 testcases</title>
<script src="ascii85.js"></script>
<script>
{
let codec = new Ascii85Codec();
function print_buffer(name, view) {
let text = name + ":";
for(let i = 0; i < view.byteLength; ++i)
text += ' ' + view.getUint8(i).toString(16).toUpperCase().padStart(2, '0');
console.log(text);
}
function test(bytes) {
console.group();
try {
let buf = new ArrayBuffer(bytes.length);
let view = new DataView(buf);
for(let i = 0; i < bytes.length; ++i)
view.setUint8(i, bytes[i]);
let enc = codec.encode(view);
let dec = codec.decode(enc);
print_buffer("test input", view);
console.log(`encoded (${enc.length}): ${enc}`);
print_buffer("round-tripped", dec);
print_buffer("round-tripped (including scratch)", new DataView(dec.buffer));
if (dec.byteLength != view.byteLength) {
throw new Error("round-trip length changed");
}
for(let i = 0; i < bytes.length; ++i) {
if (dec.getUint8(i) != bytes[i])
throw new Error("round-trip content changed");
}
if (!codec.validate(enc)) {
throw new Error("encoded text did not validate");
}
} catch (e) {
console.groupEnd();
throw e;
}
console.groupEnd();
}
test([0x11, 0x22, 0x33, 0x44]);
test([0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88]);
test([0x11, 0x22, 0x33, 0x44, 0x55]);
test([0x11, 0x22, 0x33, 0x44, 0x55, 0x66]);
test([0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77]);
test([0x11, 0x22, 0x33, 0x44, 0x00, 0x66, 0x77, 0x00, 0x99, 0xAA, 0xBB, 0xCC]);
test([0x11, 0x22, 0x33, 0x44, 0x00, 0x00, 0x00, 0x00, 0x99, 0xAA, 0xBB, 0xCC]);
test([0x11, 0x22, 0x33, 0x44, 0x20, 0x20, 0x20, 0x20, 0x99, 0xAA, 0xBB, 0xCC]);
function test_validation(str, desired) {
if (codec.validate(str) != desired) {
let error = desired ? "didn't validate when it should have" : "validated when it shouldn't have";
throw new Error(`string "${str}" ${error}`);
}
console.log(`${str} ${desired ? "validated" : "was rejected"}`);
}
test_validation("!!!!!", true);
test_validation("z!!!!!", true);
test_validation("!!!!!z", true);
test_validation("!!z!!", false);
test_validation("z!!!!", true);
}
</script>
</head>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment