Last active
September 9, 2023 05:21
-
-
Save pascaldekloe/62546103a1576803dade9269ccf76330 to your computer and use it in GitHub Desktop.
JavaScript UTF-8 encoding and decoding with TypedArray
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is free and unencumbered software released into the public domain. | |
// Marshals a string to an Uint8Array. | |
function encodeUTF8(s) { | |
var i = 0, bytes = new Uint8Array(s.length * 4); | |
for (var ci = 0; ci != s.length; ci++) { | |
var c = s.charCodeAt(ci); | |
if (c < 128) { | |
bytes[i++] = c; | |
continue; | |
} | |
if (c < 2048) { | |
bytes[i++] = c >> 6 | 192; | |
} else { | |
if (c > 0xd7ff && c < 0xdc00) { | |
if (++ci >= s.length) | |
throw new Error('UTF-8 encode: incomplete surrogate pair'); | |
var c2 = s.charCodeAt(ci); | |
if (c2 < 0xdc00 || c2 > 0xdfff) | |
throw new Error('UTF-8 encode: second surrogate character 0x' + c2.toString(16) + ' at index ' + ci + ' out of range'); | |
c = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff); | |
bytes[i++] = c >> 18 | 240; | |
bytes[i++] = c >> 12 & 63 | 128; | |
} else bytes[i++] = c >> 12 | 224; | |
bytes[i++] = c >> 6 & 63 | 128; | |
} | |
bytes[i++] = c & 63 | 128; | |
} | |
return bytes.subarray(0, i); | |
} | |
// Unmarshals a string from an Uint8Array. | |
function decodeUTF8(bytes) { | |
var i = 0, s = ''; | |
while (i < bytes.length) { | |
var c = bytes[i++]; | |
if (c > 127) { | |
if (c > 191 && c < 224) { | |
if (i >= bytes.length) | |
throw new Error('UTF-8 decode: incomplete 2-byte sequence'); | |
c = (c & 31) << 6 | bytes[i++] & 63; | |
} else if (c > 223 && c < 240) { | |
if (i + 1 >= bytes.length) | |
throw new Error('UTF-8 decode: incomplete 3-byte sequence'); | |
c = (c & 15) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; | |
} else if (c > 239 && c < 248) { | |
if (i + 2 >= bytes.length) | |
throw new Error('UTF-8 decode: incomplete 4-byte sequence'); | |
c = (c & 7) << 18 | (bytes[i++] & 63) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; | |
} else throw new Error('UTF-8 decode: unknown multibyte start 0x' + c.toString(16) + ' at index ' + (i - 1)); | |
} | |
if (c <= 0xffff) s += String.fromCharCode(c); | |
else if (c <= 0x10ffff) { | |
c -= 0x10000; | |
s += String.fromCharCode(c >> 10 | 0xd800) | |
s += String.fromCharCode(c & 0x3FF | 0xdc00) | |
} else throw new Error('UTF-8 decode: code point 0x' + c.toString(16) + ' exceeds UTF-16 reach'); | |
} | |
return s; | |
} |
Looks great, is this under any specific licence?
You are most welcome @davidkhess.
Sorry @jameswilddev, for some reason GitHub didn't notify on your message. The code is public domain so feel free to do with it as you please. :-)
Just helped me a lot to solve an issue.
I'll post this link under my solution.
https://stackoverflow.com/questions/53929108/how-to-convert-a-javascript-object-to-utf-8-blob-for-download
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice! Thanks for making this available.