Last active
August 29, 2015 14:00
-
-
Save favila/e156da820033ba1ee26e to your computer and use it in GitHub Desktop.
OBSOLETE: see https://github.com/favila/utfate which is improved. Fast utf8 decoder in JS; faster than the TextDecoder polyfill at https://github.com/inexorabletash/text-encoding and about 66% slower than the native TextDecoder on Firefox. Benchmarks: http://jsperf.com/utf8-decoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SFCCA = Function.prototype.apply.bind(String.fromCharCode, null); | |
SFCC = String.fromCharCode | |
/** | |
* Dispatch table for multi-byte UTF8 code points. | |
* | |
* Index of (utf8_prefix_byte - 0x80) is length of UTF8 byte | |
* sequence + 1. 0 means invalid prefix byte. | |
* | |
* @type {!Int8Array} | |
* @const | |
*/ | |
MULTIBYTE_UTF8 = new Int8Array([ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ | |
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0-C1 + C2-CF */ | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0-DF */ | |
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* E0-EF */ | |
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ | |
]); | |
/** | |
* @param {number} prefix_byte | |
* @return {number} | |
*/ | |
function UTF8_remaining_bytes(prefix_byte) { | |
return MULTIBYTE_UTF8[prefix_byte - 0x80] | 0; | |
} | |
/** | |
* @param {!Uint8Array} arr | |
* @param {number} start | |
* @param {number} end | |
* @return {string} | |
*/ | |
function copy_ascii_block(arr, start, end) { | |
// invariants: end > start; | |
var CHUNKLEN = 16, | |
i = start, | |
out = '', | |
SFCCA = Function.prototype.apply.bind(String.fromCharCode, null); | |
while (len > CHUNKLEN) { | |
out += SFCCA(arr.subarray(i, CHUNKLEN)); | |
i += CHUNKLEN; | |
len -= CHUNKLEN; | |
console.log(i, len, out); | |
} | |
if (len > 0) { | |
out += SFCCA(arr.subarray(i, len)); | |
} | |
return out; | |
} | |
/** | |
* @param {string} errormsg | |
*/ | |
function throwRangeError(errormsg) { | |
throw new RangeError(errormsg); | |
} | |
/** | |
* @param {!Uint8Array} in_buf | |
* @return {string} | |
*/ | |
function decodeUtf8(in_buf) { | |
var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, clen = 0, | |
end = in_buf.length, | |
errormsg = '', | |
ascii_start = 0, | |
out = ''; | |
while (i < end) { | |
c = in_buf[i]; | |
if (c < 0x80) { | |
ascii_start = ascii_start || i; | |
} else { | |
if (ascii_start < i) { | |
out += SFCCA(in_buf.subarray(ascii_start, i)); | |
} | |
clen = UTF8_remaining_bytes(c); | |
ascii_start = i + clen + 1; // Assume next byte is ascii. | |
if (i + clen > end) { | |
errormsg = 'prefix byte wants ' + (clen + 1) + 'bytes but only have ' + (end - i) + ' at offset ' + i; | |
clen = -1; // Will throw error later. | |
} | |
switch(clen) { | |
case 3: | |
// Code point is > 0xFFFF; need surrogate pair in UTF-16. | |
// We know current byte is valid. | |
c1 = in_buf[++i]; | |
c2 = in_buf[++i]; | |
c3 = in_buf[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) { | |
errormsg = 'invalid continuation byte in byte sequence 0x' | |
+ c.toString(16) + c1.toString(16) + c2.toString(16) + c3.toString(16) | |
+ ' at offset ' + (i - clen); | |
} | |
out += String.fromCharCode(0xD800 | ((c & 0x07) << 6) | (c1 & 0x3F), | |
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); | |
break; | |
case 2: | |
c1 = in_buf[++i]; | |
c2 = in_buf[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) { | |
errormsg = 'invalid continuation byte in byte sequence 0x' | |
+ c.toString(16) + c1.toString(16) + c2.toString(16) | |
+ ' at offset ' + (i - clen); | |
} | |
out += String.fromCharCode(((c & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)); | |
break; | |
case 1: | |
c1 = in_buf[++i]; | |
if ((c1 & 0xC0) !== 0x80) { | |
errormsg = 'invalid continuation byte in byte sequence 0x' | |
+ c.toString(16) + c1.toString(16) | |
+ ' at offset ' + (i - clen); | |
} | |
out += String.fromCharCode(((c & 0x1F) << 6) | (c1 & 0x3F)); | |
break; | |
case 0: | |
errormsg = 'prefix byte 0x' + c.toString(16) + ' at offset ' + i; | |
break; | |
} | |
if (errormsg) { | |
throwRangeError('Invalid UTF8: ' + errormsg); | |
} | |
} | |
i += 1; | |
} | |
if (ascii_start < end) { | |
out += SFCCA(in_buf.subarray(ascii_start, end)); | |
} | |
return out; | |
} | |
function UTF8SeqLen(c) { | |
switch (c >> 4) { | |
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: | |
return 1; | |
case 12: case 13: | |
return 2; | |
case 14: | |
return 3; | |
case 15: | |
// If 4th bit unset: 4; else 0; | |
return ~(c >> 1) & 4; | |
default: | |
return 0; | |
} | |
} | |
/** | |
* @param {number} index | |
* @param {number} bytes | |
* @param {number=} opt_needed | |
*/ | |
function throwDecodeError(index, bytes, opt_needed) { | |
var errormsg = 'Invalid UTF8 byte(s) 0x' + bytes.toString(16) + ' at index ' + index; | |
if (opt_needed) { | |
errormsg += ': need ' + (opt_needed - 1) + ' more bytes.'; | |
} else { | |
errormsg += '.'; | |
} | |
throw new RangeError(errormsg); | |
} | |
/** | |
* @param {!Uint8Array} ib | |
* @return {string} | |
*/ | |
function UTF8_inline(ib) { | |
var i = 0, ib_len = ib.length, | |
out = '', | |
ascii_start = 0, clen = 0, | |
c0 = 0, c1 = 0, c2 = 0, c3 = 0; | |
while (i < ib_len) { | |
c0 = ib[i]; | |
if (c0 < 0x80) { | |
// Fast-path: note first seen ascii char and continue. | |
ascii_start = ascii_start || i; | |
++i; | |
continue; | |
} | |
if (ascii_start < i) { | |
out += SFCCA(ib.subarray(ascii_start, i)); | |
} | |
switch (c0 >> 4) { | |
case 12: case 13: | |
clen = 2; | |
break; | |
case 14: | |
clen = 3; | |
break; | |
case 15: | |
// If 4th bit unset: 4; else 0; | |
clen = ~(c0 >> 1) & 4; | |
break; | |
default: | |
clen = 0; | |
break; | |
} | |
ascii_start = i + clen; | |
if (i + clen > ib_len) { | |
throwDecodeError(i, c0, clen); | |
} | |
switch (clen) { | |
case 0: | |
throwDecodeError(i, c0); | |
case 2: | |
c1 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 1, ((c0 << 8) | c1)); | |
} | |
out += SFCC(((c0 & 0x1F) << 6) | (c1 & 0x3F)); | |
break; | |
case 3: | |
c1 = ib[++i]; | |
c2 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 2, ((c0 << 16) | (c1 << 8) | c2)); | |
} | |
out += SFCC(((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)); | |
break; | |
case 4: | |
c1 = ib[++i]; | |
c2 = ib[++i]; | |
c3 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 3, ((c0 << 24) | (c1 << 16) | (c2 << 8) | c3)); | |
} | |
out += SFCC(0xD800 | ((c0 & 0x07) << 6) | (c1 & 0x3F), | |
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); | |
break; | |
} | |
++i; | |
} | |
if (ascii_start < ib_len) { | |
out += SFCCA(ib.subarray(ascii_start, ib_len)); | |
} | |
return out; | |
} | |
/** | |
* @param {!Uint8Array} ib | |
* @return {string} | |
*/ | |
function UTF8_simple(ib) { | |
var i = 0, ib_len = ib.length, | |
out = '', | |
c0 = 0, c1 = 0, c2 = 0, c3 = 0; | |
while (i < ib_len) { | |
c0 = ib[i]; | |
switch (c0 >> 4) { | |
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: | |
out += SFCC(c0); | |
break; | |
case 12: case 13: | |
if (i + 2 > ib_len) { | |
throwDecodeError(i, c0, 2); | |
} | |
clen = 2; | |
break; | |
case 14: | |
clen = 3; | |
break; | |
case 15: | |
// If 4th bit unset: 4; else 0; | |
clen = ~(c0 >> 1) & 4; | |
break; | |
default: | |
clen = 0; | |
break; | |
} | |
switch (clen) { | |
case 0: | |
throwDecodeError(i, c0); | |
case 2: | |
c1 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 1, ((c0 << 8) | c1)); | |
} | |
out += SFCC(((c0 & 0x1F) << 6) | (c1 & 0x3F)); | |
break; | |
case 3: | |
c1 = ib[++i]; | |
c2 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 2, ((c0 << 16) | (c1 << 8) | c2)); | |
} | |
out += SFCC(((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)); | |
break; | |
case 4: | |
c1 = ib[++i]; | |
c2 = ib[++i]; | |
c3 = ib[++i]; | |
if ((c1 & 0xC0) !== 0x80 || (c2 & 0xC0) !== 0x80 || (c3 & 0xC0) !== 0x80) { | |
throwDecodeError(i - 3, ((c0 << 24) | (c1 << 16) | (c2 << 8) | c3)); | |
} | |
out += SFCC(0xD800 | ((c0 & 0x07) << 6) | (c1 & 0x3F), | |
0xDC00 | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); | |
break; | |
} | |
++i; | |
} | |
if (ascii_start < ib_len) { | |
out += SFCCA(ib.subarray(ascii_start, ib_len)); | |
} | |
return out; | |
} | |
UTF8InvalidTailBits = [1,1,0,1]; | |
UTF8InvalidOffset = [0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4]; | |
UTF8OverlongMinimum = [0, 0x80, 0x800, 0x10000]; | |
UTF8MagicSubtraction = [0, 0x00003080, 0x000E2080, 0x03C82080]; | |
function inRange(c, lo, hi) { return ((c - lo) < (hi - lo + 1)); } | |
function isSurrogate(c) { return inRange(c, 0xd800, 0xdfff); } | |
function isNoncharacter(c) { return inRange(c, 0xfdd0, 0xfdef); } | |
function isReserved(c) { return ((c & 0xfffe) == 0xfffe); } | |
function isOutOfRange(c) { return (c > 0x10ffff); } | |
function UTF8TailLen(c) { | |
switch (c >> 4) { | |
case 12: case 13: | |
return 1; | |
case 14: | |
return 2; | |
case 15: | |
// If 4th bit unset: 3; else error; | |
return (c & 0x8) ? -1 : 3; | |
default: | |
return -1; | |
} | |
} | |
//BROKEN | |
// Test with http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt | |
// http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks/ | |
/** | |
* @param {!Uint8Array} ib | |
* @param {number=} opt_start | |
* @param {number=} opt_end | |
* @return {string} | |
*/ | |
function decodeUTF8_tricky(ib, opt_start, opt_end) { | |
var i = opt_start || 0, end = opt_end || ib.length, | |
ibi = 0, tail = 0, | |
ascii_start = i, c = 0, mask = 0, | |
out = ''; | |
while (i < end) { | |
ibi = ib[i]; | |
// Fast-path for ascii. Will decode in bulk. | |
if (ibi < 0x80) { | |
++i; | |
continue; | |
} | |
if (ascii_start < i) { | |
out += SFCCA(ib.subarray(ascii_start, i)); | |
} | |
ascii_start = i; | |
tail = UTF8TailLen(ibi); | |
if (tail >= end - i) { | |
throwDecodeError(i, ibi, tail); | |
} | |
mask = 0; | |
switch (tail) { | |
case 3: | |
c = (c + ibi) << 6; | |
ibi = ib[++i]; | |
// same as (mask << 1) | UTF8InvalidTailBits[ibi >> 6]; | |
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1); | |
case 2: | |
c = (c + ibi) << 6; | |
ibi = ib[++i]; | |
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1); | |
case 1: | |
c = (c + ibi) << 6; | |
ibi = ib[++i]; | |
mask = (mask << 1) | ((0xb >> (ibi >> 6)) & 1); | |
case 0: | |
c += ibi; | |
++i; | |
break; | |
case -1: | |
throwDecodeError(i, c); | |
break; | |
} | |
if (mask) { | |
throwDecodeError(i - UTF8InvalidOffset[mask], ib[i - UTF8InvalidOffset[mask]]); | |
} | |
c -= UTF8MagicSubtraction[tail]; | |
if (c < UTF8OverlongMinimum[tail] || isSurrogate(c) || isNoncharacter(c) || | |
isReserved(c) || isOutOfRange(c)) { | |
throwDecodeError(i - tail, ib[i - tail]); | |
} | |
if (tail === 3) { | |
c -= 0x10000; | |
out += SFCC(0xD800 | (c >> 10), 0xDC00 | (c & 0x3FF)); | |
} else { | |
out += SFCC(c); | |
} | |
} | |
if (ascii_start < i) { | |
out += SFCCA(ib.subarray(ascii_start, i)); | |
} | |
return out; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* @param {string} errormsg | |
*/ | |
function throwRangeError(errormsg) { | |
throw new RangeError(errormsg); | |
} | |
/**@enum {number} */ | |
ENCODINGERRORS = { | |
NONE: 0, | |
ORPHAN_LSUR: 1, | |
ORPHAN_TSUR: 2, | |
OUTBUF_END: 3 | |
} | |
/** | |
* @param {string} string | |
* @param {!Uint8Array} out | |
* @param {number=} opt_startchar | |
* @param {number=} opt_endchar | |
* @return {!{chars:number,bytes:number,error:ENCODINGERRORS}} chars and bytes written | |
*/ | |
function encodeUtf8(string, out, opt_startchar, opt_endchar) { | |
var i = opt_startchar || 0, c = 0, c1 = 0, end = opt_endchar || string.length, | |
bytei = 0, byteend = out.length, bytesneeded = 1, extracharneeded = 0, | |
CCA = String.prototype.charCodeAt.bind(string); | |
while (i < end) { | |
c = CCA(i); | |
switch (c & 0xFC00) { | |
case 0xDC00: | |
return {chars: i, bytes: bytei, error: ENCODINGERRORS.ORPHAN_TSUR}; | |
case 0xD800: | |
// Surrogate pair: 4 UTF8 bytes | |
c1 = CCA(++i); | |
if ((c1 & 0xFC00) !== 0xDC00) { | |
return {chars: i - 1, bytes: bytei, error: ENCODINGERRORS.ORPHAN_LSUR}; | |
} | |
extracharneeded = 1; | |
bytesneeded = 4; | |
out[bytei++] = 0xF0 | ((c >>> 7) & 0x07); | |
out[bytei++] = 0x80 | ((c >>> 1) & 0x3F); | |
out[bytei++] = 0x80 | ((c & 1) << 5) | ((c1 >>> 5) & 0x1F); | |
out[bytei++] = 0x80 | (c1 & 0x03F); | |
break; | |
default: | |
extracharneeded = 0; | |
if (c < 0x80) { // ASCII: 1 UTF8 byte. | |
bytesneeded = 1; | |
out[bytei++] = c; | |
} else { | |
if (c < 0x800) { // 2 UTF8 bytes. | |
bytesneeded = 2; | |
out[bytei++] = 0xC0 | (c >>> 6); | |
} else { // 3 UTF8 bytes. | |
bytesneeded = 3; | |
out[bytei++] = 0xE0 | ((c >>> 12) & 0x0F); | |
out[bytei++] = 0x80 | ((c >>> 6) & 0x3F); | |
} | |
out[bytei++] = 0x80 | (c & 0x3F); | |
} | |
break; | |
} | |
if (bytei > byteend) { | |
return {chars: i - extracharneeded, bytes: bytei - bytesneeded, error: ENCODINGERRORS.OUTBUF_END}; | |
} | |
++i; | |
} | |
return {chars: i, bytes: bytei, error: ENCODINGERRORS.NONE}; | |
} | |
/** | |
* @param {number} utf16_code_point | |
* @return {number} utf8 bytes needed, or 0 if trailing surrogate. | |
*/ | |
function utf8_bytes_needed(utf16_code_point) { | |
if (utf16_code_point < 0x80) { | |
return 1; | |
} | |
if (utf16_code_point < 0x800) { | |
return 2; | |
} | |
if (utf16_code_point < 0xD800) { | |
return 3; | |
} | |
if (utf16_code_point < 0xDC00) { | |
return 4; | |
} | |
if (utf16_code_point < 0xE000) { | |
return 0; | |
} | |
return 3; | |
} | |
//WORKING!! | |
/** | |
* @param {string} string | |
* @param {!Uint8Array} out | |
* @param {number=} opt_startchar | |
* @param {number=} opt_endchar | |
* @param {number=} opt_startbyte | |
* @param {number=} opt_endbyte | |
* @return {!{chars:number,bytes:number,error:ENCODINGERRORS}} chars and bytes written | |
*/ | |
function encodeUtf8_2(string, out, opt_startchar, opt_endchar, opt_startbyte, opt_endbyte) { | |
var i = opt_startchar || 0, | |
end = opt_endchar || string.length, | |
bytei = opt_startbyte || 0, | |
byteend = opt_endbyte || out.length, | |
bytesneeded = 0, | |
c = 0, c1 = 0, | |
CCA = String.prototype.charCodeAt.bind(string); | |
while (i < end) { | |
c = CCA(i); | |
if (c < 0x80) { | |
// Ascii fast-path | |
if (byteend <= bytei) { | |
return {chars: i, bytes: bytei, error: ENCODINGERRORS.OUTBUF_END}; | |
} | |
out[bytei++] = c; | |
++i; | |
continue; | |
} else if (c < 0x800) { | |
bytesneeded = 2; | |
} else if (c < 0xD800) { | |
bytesneeded = 3; | |
} else if (c < 0xDC00) { | |
bytesneeded = 4; | |
} else if (c < 0xE000) { | |
bytesneeded = 0; | |
} else { | |
bytesneeded = 3; | |
} | |
bytei += bytesneeded; | |
if (bytei > byteend) { | |
return {chars: i, bytes: bytei - bytesneeded, error: ENCODINGERRORS.OUTBUF_END}; | |
} | |
switch (bytesneeded) { | |
case 4: | |
// Convert to UTF32 because we need to add 0x10000 to the code point. | |
c1 = CCA(++i); | |
if ((c1 & 0xFC00) !== 0xDC00) { | |
return {chars: i - 1, bytes: bytei - bytesneeded, error: ENCODINGERRORS.ORPHAN_LSUR}; | |
} | |
c = (((c & 0x3FF) << 10) | (c1 & 0x3FF)) + 0x10000; | |
out[--bytei] = 0x80 | (c & 0x3F); | |
c >>= 6; | |
case 3: | |
out[--bytei] = 0x80 | (c & 0x3F); | |
c >>= 6; | |
case 2: | |
out[--bytei] = 0x80 | (c & 0x3F); | |
c >>= 6; | |
case 1: | |
// This does *not* cover ascii case because the shift is wrong! | |
// Uint8Array will mask for us. | |
out[--bytei] = (0xF00 >> bytesneeded) | c; | |
bytei += bytesneeded; | |
++i; | |
break; | |
case 0: | |
return {chars: i, bytes: bytei, error: ENCODINGERRORS.ORPHAN_TSUR}; | |
} | |
} | |
return {chars: i, bytes: bytei, error: ENCODINGERRORS.NONE}; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function teststr() { | |
var out = ''; | |
for (var i = 0x20; i < 0x10ffff; i = i + 64) { | |
out += String.fromCharCode(i); | |
} | |
return out; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment