Skip to content

Instantly share code, notes, and snippets.

@favila
Last active August 29, 2015 14:00
Show Gist options
  • Save favila/e156da820033ba1ee26e to your computer and use it in GitHub Desktop.
Save favila/e156da820033ba1ee26e to your computer and use it in GitHub Desktop.
OBSOLETE: see https://github.com/favila/utfate which is improved. Fast utf8 decoder in JS; faster than the TextDecoder polyfill at https://github.com/inexorabletash/text-encoding and about 66% slower than the native TextDecoder on Firefox. Benchmarks: http://jsperf.com/utf8-decoding
SFCCA = Function.prototype.apply.bind(String.fromCharCode, null);
SFCC = String.fromCharCode
/**
* Dispatch table for multi-byte UTF8 code points.
*
* Index of (utf8_prefix_byte - 0x80) is length of UTF8 byte
* sequence + 1. 0 means invalid prefix byte.
*
* @type {!Int8Array}
* @const
*/
MULTIBYTE_UTF8 = new Int8Array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0-C1 + C2-CF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0-DF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* E0-EF */
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
]);
/**
* @param {number} prefix_byte
* @return {number}
*/
function UTF8_remaining_bytes(prefix_byte) {
return MULTIBYTE_UTF8[prefix_byte - 0x80] | 0;
}
/**
* @param {!Uint8Array} arr
* @param {number} start
* @param {number} end
* @return {string}
*/
function copy_ascii_block(arr, start, end) {
// invariants: end > start;
var CHUNKLEN = 16,
i = start,
out = '',
SFCCA = Function.prototype.apply.bind(String.fromCharCode, null);
while (len > CHUNKLEN) {
out += SFCCA(arr.subarray(i, CHUNKLEN));
i += CHUNKLEN;
len -= CHUNKLEN;
console.log(i, len, out);
}
if (len > 0) {
out += SFCCA(arr.subarray(i, len));
}
return out;
}
/**
* @param {string} errormsg
*/
function throwRangeError(errormsg) {
throw new RangeError(errormsg);
}
/**
* @param {!Uint8Array} in_buf
* @return {string}
*/
function decodeUtf8(in_buf) {
var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, clen = 0,
end = in_buf.length,
errormsg = '',
ascii_start = 0,
out = '';
while (i < end) {
c = in_buf[i];
if (c < 0x80) {
ascii_start = ascii_start || i;
} else {
if (ascii_start < i) {
out += SFCCA(in_buf.subarray(ascii_start, i));
}
clen = UTF8_remaining_bytes(c);
ascii_start = i + clen + 1; // Assume next byte is ascii.
if (i + clen > end) {
errormsg = 'prefix byte wants ' + (clen + 1) + 'bytes but only have ' + (end - i) + ' at offset ' + i;
clen = -1; // Will throw error later.
}
switch(clen) {
case 3:
// Code point is > 0xFFFF; need surrogate pair in UTF-16.
// We know current byte is valid.
c1 = in_buf[++i];
c2 = in_buf[++i];
c3 = in_buf[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) {
errormsg = 'invalid continuation byte in byte sequence 0x'
+ c.toString(16) + c1.toString(16) + c2.toString(16) + c3.toString(16)
+ ' at offset ' + (i - clen);
}
out += String.fromCharCode(0xD800 | ((c & 0x07) << 6) | (c1 & 0x3F),
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
break;
case 2:
c1 = in_buf[++i];
c2 = in_buf[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) {
errormsg = 'invalid continuation byte in byte sequence 0x'
+ c.toString(16) + c1.toString(16) + c2.toString(16)
+ ' at offset ' + (i - clen);
}
out += String.fromCharCode(((c & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F));
break;
case 1:
c1 = in_buf[++i];
if ((c1 & 0xC0) !== 0x80) {
errormsg = 'invalid continuation byte in byte sequence 0x'
+ c.toString(16) + c1.toString(16)
+ ' at offset ' + (i - clen);
}
out += String.fromCharCode(((c & 0x1F) << 6) | (c1 & 0x3F));
break;
case 0:
errormsg = 'prefix byte 0x' + c.toString(16) + ' at offset ' + i;
break;
}
if (errormsg) {
throwRangeError('Invalid UTF8: ' + errormsg);
}
}
i += 1;
}
if (ascii_start < end) {
out += SFCCA(in_buf.subarray(ascii_start, end));
}
return out;
}
function UTF8SeqLen(c) {
switch (c >> 4) {
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
return 1;
case 12: case 13:
return 2;
case 14:
return 3;
case 15:
// If 4th bit unset: 4; else 0;
return ~(c >> 1) & 4;
default:
return 0;
}
}
/**
* @param {number} index
* @param {number} bytes
* @param {number=} opt_needed
*/
function throwDecodeError(index, bytes, opt_needed) {
var errormsg = 'Invalid UTF8 byte(s) 0x' + bytes.toString(16) + ' at index ' + index;
if (opt_needed) {
errormsg += ': need ' + (opt_needed - 1) + ' more bytes.';
} else {
errormsg += '.';
}
throw new RangeError(errormsg);
}
/**
* @param {!Uint8Array} ib
* @return {string}
*/
function UTF8_inline(ib) {
var i = 0, ib_len = ib.length,
out = '',
ascii_start = 0, clen = 0,
c0 = 0, c1 = 0, c2 = 0, c3 = 0;
while (i < ib_len) {
c0 = ib[i];
if (c0 < 0x80) {
// Fast-path: note first seen ascii char and continue.
ascii_start = ascii_start || i;
++i;
continue;
}
if (ascii_start < i) {
out += SFCCA(ib.subarray(ascii_start, i));
}
switch (c0 >> 4) {
case 12: case 13:
clen = 2;
break;
case 14:
clen = 3;
break;
case 15:
// If 4th bit unset: 4; else 0;
clen = ~(c0 >> 1) & 4;
break;
default:
clen = 0;
break;
}
ascii_start = i + clen;
if (i + clen > ib_len) {
throwDecodeError(i, c0, clen);
}
switch (clen) {
case 0:
throwDecodeError(i, c0);
case 2:
c1 = ib[++i];
if ((c1 & 0xC0) !== 0x80) {
throwDecodeError(i - 1, ((c0 << 8) | c1));
}
out += SFCC(((c0 & 0x1F) << 6) | (c1 & 0x3F));
break;
case 3:
c1 = ib[++i];
c2 = ib[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) {
throwDecodeError(i - 2, ((c0 << 16) | (c1 << 8) | c2));
}
out += SFCC(((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F));
break;
case 4:
c1 = ib[++i];
c2 = ib[++i];
c3 = ib[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) {
throwDecodeError(i - 3, ((c0 << 24) | (c1 << 16) | (c2 << 8) | c3));
}
out += SFCC(0xD800 | ((c0 & 0x07) << 6) | (c1 & 0x3F),
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
break;
}
++i;
}
if (ascii_start < ib_len) {
out += SFCCA(ib.subarray(ascii_start, ib_len));
}
return out;
}
/**
* @param {!Uint8Array} ib
* @return {string}
*/
function UTF8_simple(ib) {
var i = 0, ib_len = ib.length,
out = '',
c0 = 0, c1 = 0, c2 = 0, c3 = 0;
while (i < ib_len) {
c0 = ib[i];
switch (c0 >> 4) {
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
out += SFCC(c0);
break;
case 12: case 13:
if (i + 2 > ib_len) {
throwDecodeError(i, c0, 2);
}
clen = 2;
break;
case 14:
clen = 3;
break;
case 15:
// If 4th bit unset: 4; else 0;
clen = ~(c0 >> 1) & 4;
break;
default:
clen = 0;
break;
}
switch (clen) {
case 0:
throwDecodeError(i, c0);
case 2:
c1 = ib[++i];
if ((c1 & 0xC0) !== 0x80) {
throwDecodeError(i - 1, ((c0 << 8) | c1));
}
out += SFCC(((c0 & 0x1F) << 6) | (c1 & 0x3F));
break;
case 3:
c1 = ib[++i];
c2 = ib[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) {
throwDecodeError(i - 2, ((c0 << 16) | (c1 << 8) | c2));
}
out += SFCC(((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F));
break;
case 4:
c1 = ib[++i];
c2 = ib[++i];
c3 = ib[++i];
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) {
throwDecodeError(i - 3, ((c0 << 24) | (c1 << 16) | (c2 << 8) | c3));
}
out += SFCC(0xD800 | ((c0 & 0x07) << 6) | (c1 & 0x3F),
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
break;
}
++i;
}
if (ascii_start < ib_len) {
out += SFCCA(ib.subarray(ascii_start, ib_len));
}
return out;
}
UTF8InvalidTailBits = [1,1,0,1];
UTF8InvalidOffset = [0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4];
UTF8OverlongMinimum = [0, 0x80, 0x800, 0x10000];
UTF8MagicSubtraction = [0, 0x00003080, 0x000E2080, 0x03C82080];
function inRange(c, lo, hi) { return ((c - lo) < (hi - lo + 1)); }
function isSurrogate(c) { return inRange(c, 0xd800, 0xdfff); }
function isNoncharacter(c) { return inRange(c, 0xfdd0, 0xfdef); }
function isReserved(c) { return ((c & 0xfffe) == 0xfffe); }
function isOutOfRange(c) { return (c > 0x10ffff); }
function UTF8TailLen(c) {
switch (c >> 4) {
case 12: case 13:
return 1;
case 14:
return 2;
case 15:
// If 4th bit unset: 3; else error;
return (c & 0x8) ? -1 : 3;
default:
return -1;
}
}
//BROKEN
// Test with http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
// http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks/
/**
* @param {!Uint8Array} ib
* @param {number=} opt_start
* @param {number=} opt_end
* @return {string}
*/
function decodeUTF8_tricky(ib, opt_start, opt_end) {
var i = opt_start || 0, end = opt_end || ib.length,
ibi = 0, tail = 0,
ascii_start = i, c = 0, mask = 0,
out = '';
while (i < end) {
ibi = ib[i];
// Fast-path for ascii. Will decode in bulk.
if (ibi < 0x80) {
++i;
continue;
}
if (ascii_start < i) {
out += SFCCA(ib.subarray(ascii_start, i));
}
ascii_start = i;
tail = UTF8TailLen(ibi);
if (tail >= end - i) {
throwDecodeError(i, ibi, tail);
}
mask = 0;
switch (tail) {
case 3:
c = (c + ibi) << 6;
ibi = ib[++i];
// same as (mask << 1) | UTF8InvalidTailBits[ibi >> 6];
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1);
case 2:
c = (c + ibi) << 6;
ibi = ib[++i];
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1);
case 1:
c = (c + ibi) << 6;
ibi = ib[++i];
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1);
case 0:
c += ibi;
++i;
break;
case -1:
throwDecodeError(i, c);
break;
}
if (mask) {
throwDecodeError(i - UTF8InvalidOffset[mask], ib[i - UTF8InvalidOffset[mask]]);
}
c -= UTF8MagicSubtraction[tail];
if (c < UTF8OverlongMinimum[tail] || isSurrogate(c) || isNoncharacter(c) ||
isReserved(c) || isOutOfRange(c)) {
throwDecodeError(i - tail, ib[i - tail]);
}
if (tail === 3) {
c -= 0x10000;
out += SFCC(0xD800 | (c >> 10), 0xDC00 | (c & 0x3FF));
} else {
out += SFCC(c);
}
}
if (ascii_start < i) {
out += SFCCA(ib.subarray(ascii_start, i));
}
return out;
}
/**
* @param {string} errormsg
*/
function throwRangeError(errormsg) {
throw new RangeError(errormsg);
}
/**@enum {number} */
ENCODINGERRORS = {
NONE: 0,
ORPHAN_LSUR: 1,
ORPHAN_TSUR: 2,
OUTBUF_END: 3
}
/**
* @param {string} string
* @param {!Uint8Array} out
* @param {number=} opt_startchar
* @param {number=} opt_endchar
* @return {!{chars:number,bytes:number,error:ENCODINGERRORS}} chars and bytes written
*/
function encodeUtf8(string, out, opt_startchar, opt_endchar) {
var i = opt_startchar || 0, c = 0, c1 = 0, end = opt_endchar || string.length,
bytei = 0, byteend = out.length, bytesneeded = 1, extracharneeded = 0,
CCA = String.prototype.charCodeAt.bind(string);
while (i < end) {
c = CCA(i);
switch (c & 0xFC00) {
case 0xDC00:
return {chars: i, bytes: bytei, error: ENCODINGERRORS.ORPHAN_TSUR};
case 0xD800:
// Surrogate pair: 4 UTF8 bytes
c1 = CCA(++i);
if ((c1 & 0xFC00) !== 0xDC00) {
return {chars: i - 1, bytes: bytei, error: ENCODINGERRORS.ORPHAN_LSUR};
}
extracharneeded = 1;
bytesneeded = 4;
out[bytei++] = 0xF0 | ((c >>> 7) & 0x07);
out[bytei++] = 0x80 | ((c >>> 1) & 0x3F);
out[bytei++] = 0x80 | ((c & 1) << 5) | ((c1 >>> 5) & 0x1F);
out[bytei++] = 0x80 | (c1 & 0x03F);
break;
default:
extracharneeded = 0;
if (c < 0x80) { // ASCII: 1 UTF8 byte.
bytesneeded = 1;
out[bytei++] = c;
} else {
if (c < 0x800) { // 2 UTF8 bytes.
bytesneeded = 2;
out[bytei++] = 0xC0 | (c >>> 6);
} else { // 3 UTF8 bytes.
bytesneeded = 3;
out[bytei++] = 0xE0 | ((c >>> 12) & 0x0F);
out[bytei++] = 0x80 | ((c >>> 6) & 0x3F);
}
out[bytei++] = 0x80 | (c & 0x3F);
}
break;
}
if (bytei > byteend) {
return {chars: i - extracharneeded, bytes: bytei - bytesneeded, error: ENCODINGERRORS.OUTBUF_END};
}
++i;
}
return {chars: i, bytes: bytei, error: ENCODINGERRORS.NONE};
}
/**
* @param {number} utf16_code_point
* @return {number} utf8 bytes needed, or 0 if trailing surrogate.
*/
function utf8_bytes_needed(utf16_code_point) {
if (utf16_code_point < 0x80) {
return 1;
}
if (utf16_code_point < 0x800) {
return 2;
}
if (utf16_code_point < 0xD800) {
return 3;
}
if (utf16_code_point < 0xDC00) {
return 4;
}
if (utf16_code_point < 0xE000) {
return 0;
}
return 3;
}
//WORKING!!
/**
* @param {string} string
* @param {!Uint8Array} out
* @param {number=} opt_startchar
* @param {number=} opt_endchar
* @param {number=} opt_startbyte
* @param {number=} opt_endbyte
* @return {!{chars:number,bytes:number,error:ENCODINGERRORS}} chars and bytes written
*/
function encodeUtf8_2(string, out, opt_startchar, opt_endchar, opt_startbyte, opt_endbyte) {
var i = opt_startchar || 0,
end = opt_endchar || string.length,
bytei = opt_startbyte || 0,
byteend = opt_endbyte || out.length,
bytesneeded = 0,
c = 0, c1 = 0,
CCA = String.prototype.charCodeAt.bind(string);
while (i < end) {
c = CCA(i);
if (c < 0x80) {
// Ascii fast-path
if (byteend <= bytei) {
return {chars: i, bytes: bytei, error: ENCODINGERRORS.OUTBUF_END};
}
out[bytei++] = c;
++i;
continue;
} else if (c < 0x800) {
bytesneeded = 2;
} else if (c < 0xD800) {
bytesneeded = 3;
} else if (c < 0xDC00) {
bytesneeded = 4;
} else if (c < 0xE000) {
bytesneeded = 0;
} else {
bytesneeded = 3;
}
bytei += bytesneeded;
if (bytei > byteend) {
return {chars: i, bytes: bytei - bytesneeded, error: ENCODINGERRORS.OUTBUF_END};
}
switch (bytesneeded) {
case 4:
// Convert to UTF32 because we need to add 0x10000 to the code point.
c1 = CCA(++i);
if ((c1 & 0xFC00) !== 0xDC00) {
return {chars: i - 1, bytes: bytei - bytesneeded, error: ENCODINGERRORS.ORPHAN_LSUR};
}
c = (((c & 0x3FF) << 10) | (c1 & 0x3FF)) + 0x10000;
out[--bytei] = 0x80 | (c & 0x3F);
c >>= 6;
case 3:
out[--bytei] = 0x80 | (c & 0x3F);
c >>= 6;
case 2:
out[--bytei] = 0x80 | (c & 0x3F);
c >>= 6;
case 1:
// This does *not* cover ascii case because the shift is wrong!
// Uint8Array will mask for us.
out[--bytei] = (0xF00 >> bytesneeded) | c;
bytei += bytesneeded;
++i;
break;
case 0:
return {chars: i, bytes: bytei, error: ENCODINGERRORS.ORPHAN_TSUR};
}
}
return {chars: i, bytes: bytei, error: ENCODINGERRORS.NONE};
}
function teststr() {
var out = '';
for (var i = 0x20; i < 0x10ffff; i = i + 64) {
out += String.fromCharCode(i);
}
return out;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment