-
-
Save joni/3760795 to your computer and use it in GitHub Desktop.
/**
 * Encodes a JavaScript (UTF-16) string as an array of UTF-8 byte values.
 *
 * Valid surrogate pairs are combined into a single code point and emitted
 * as a 4-byte sequence. An unpaired surrogate (a high surrogate that is not
 * followed by a low surrogate, or a low surrogate on its own) has no valid
 * UTF-8 form, so it is encoded as U+FFFD REPLACEMENT CHARACTER
 * (0xEF 0xBF 0xBD) and the following character is left untouched.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes, each in the range 0..255.
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);

        // High surrogate: only combine it with the next unit when that unit
        // really is a low surrogate.
        if (charcode >= 0xd800 && charcode < 0xdc00 && i + 1 < str.length) {
            var next = str.charCodeAt(i + 1);
            if (next >= 0xdc00 && next < 0xe000) {
                // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
                // splitting the 20 bits of 0x0-0xFFFFF into two halves.
                charcode = 0x10000 + (((charcode & 0x3ff) << 10) | (next & 0x3ff));
                i++;
            }
        }

        // Anything still in the surrogate range at this point is unpaired.
        if (charcode >= 0xd800 && charcode < 0xe000) {
            charcode = 0xfffd;
        }

        if (charcode < 0x80) {
            utf8.push(charcode);
        } else if (charcode < 0x800) {
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        } else if (charcode < 0x10000) {
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        } else {
            utf8.push(0xf0 | (charcode >> 18),
                      0x80 | ((charcode >> 12) & 0x3f),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        }
    }
    return utf8;
}
Maybe a little late, but in case someone finds it useful, here's a solution for decoding a UTF-8 array:
/**
 * Decodes an array of UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Each element is masked to 8 bits, so signed byte values (-128..127) are
 * accepted as well as 0..255. Malformed input never reads past the end of
 * the array: a stray continuation byte, an invalid lead byte, a truncated
 * sequence, a lead byte followed by a non-continuation byte, or a value
 * above U+10FFFF each produce one U+FFFD REPLACEMENT CHARACTER. Overlong
 * encodings are decoded leniently rather than rejected.
 *
 * @param {ArrayLike<number>} data - The UTF-8 bytes.
 * @returns {string} The decoded string.
 */
function fromUTF8Array(data) { // array of bytes
    var str = '';
    var i = 0;
    while (i < data.length) {
        var lead = data[i] & 0xFF;
        var needed;    // number of continuation bytes expected after the lead
        var codePoint; // payload bits accumulated so far
        i += 1;

        if (lead < 0x80) {
            needed = 0;
            codePoint = lead;
        } else if (lead >= 0xC0 && lead < 0xE0) {
            needed = 1;
            codePoint = lead & 0x1F;
        } else if (lead >= 0xE0 && lead < 0xF0) {
            needed = 2;
            codePoint = lead & 0x0F;
        } else if (lead >= 0xF0 && lead < 0xF8) {
            needed = 3;
            codePoint = lead & 0x07;
        } else {
            // 0x80-0xBF is a continuation byte without a lead; 0xF8-0xFF is
            // never valid in UTF-8.
            str += '\uFFFD';
            continue;
        }

        var complete = true;
        for (var k = 0; k < needed; k++) {
            // Stop without consuming the offending byte so that it is
            // decoded on its own in the next iteration.
            if (i >= data.length || (data[i] & 0xC0) !== 0x80) {
                complete = false;
                break;
            }
            codePoint = (codePoint << 6) | (data[i] & 0x3F);
            i += 1;
        }

        if (!complete || codePoint > 0x10FFFF) {
            str += '\uFFFD';
        } else if (codePoint < 0x10000) {
            str += String.fromCharCode(codePoint);
        } else {
            // surrogate pair
            codePoint -= 0x010000;
            str += String.fromCharCode(codePoint >> 10 | 0xD800, codePoint & 0x03FF | 0xDC00);
        }
    }
    return str;
}
Simple decode UTF-8 array in JavaScript:
/**
 * Builds a string from a comma-separated list of character codes.
 *
 * Each entry is passed to String.fromCharCode as a single UTF-16 code unit;
 * multi-byte UTF-8 sequences are NOT combined, so the result is only a
 * correct UTF-8 decoding when every value is below 0x80 (ASCII).
 *
 * The input is parsed explicitly instead of being handed to eval(), so
 * caller-supplied text can no longer execute arbitrary code.
 *
 * @param {string|number[]} $ - Codes as "119,119,46" or as an array; an
 *     array is converted with its default comma-joined string form.
 *     null, undefined and blank input yield an empty string.
 * @returns {string} The characters for the given codes.
 * @throws {TypeError} If an entry is empty or not a finite number.
 */
function fromUTF8Array($) {
    var text = $ == null ? '' : String($);
    if (text.trim() === '') {
        return '';
    }
    var parts = text.split(',');
    var str = '';
    for (var i = 0; i < parts.length; i++) {
        var part = parts[i].trim();
        var code = Number(part);
        if (part === '' || !isFinite(code)) {
            throw new TypeError('Invalid character code: "' + parts[i] + '"');
        }
        str += String.fromCharCode(code);
    }
    return str;
}
// Usage example: the listed codes spell "www.WHAK.com", which is passed to alert()
// (presumably the browser dialog function; it is not defined in this file).
alert(fromUTF8Array("119,119,119,46,87,72,65,75,46,99,111,109"));
@frozn How to use fromUTF8Array for array of bytes like [0,1,0,-1] ??
there's "something" wrong with the surrogate pair portion of your code.
I think
charcode = 0x10000 + (((charcode & 0x3ff)<<10)
| (str.charCodeAt(i) & 0x3ff))
should be
charcode = 0x10000 + (((charcode & 0x3ff)<<10)
| (str.charCodeAt(i+1) & 0x3ff))
All I know is that you never add in the following pair and increment the index accordingly
This worked great when creating a .doc file from a Blob. For some reason, my old method using charCodeAt(i) stopped working and showed some weird characters when opening the file. This is how my method works:
// Markup to be saved; the accented letters are what need a real UTF-8 encoding.
var htmlString = '<div>Your html á é í ó ú</div>';

// toUTF8Array is your function; it returns a plain array of byte values.
var arrayUTF8 = toUTF8Array(htmlString);

// Copy the byte values into a typed array of the same length for the Blob.
var byteNumbers = Uint8Array.from(arrayUTF8);

// Wrap the bytes in a Blob and pass it to FileSaver under a .doc file name.
var blob = new Blob([byteNumbers], { type: 'text/html;charset=UTF-8;' });
FileSaver.saveAs(blob, 'yourfile.doc');
@bkdotcom It looks like that extra i+1 in str.charCodeAt should not be there, because there is already an i++; right after the else. WDYT?
What I don't quite understand is
} else if (charcode < 0x800) {
// ...
} else if (charcode < 0xd800 || charcode >= 0xe000) {
// ^ never true given previous if
**UPD:** Here is a similar function inside the Google Closure Library: [stringToUtf8ByteArray()](https://github.com/google/closure-library/blob/8598d87242af59aac233270742c8984e2b2bdbe0/closure/goog/crypt/crypt.js#L117-L143). The fact that strings are stored as UTF-16 in JavaScript's memory was an eye-opener for me!
toUTF8Array() contains an error which results in incorrect UTF-8 encoding in the surrogate-pair branch.
In the line containing charcode = ((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff) it's necessary to add 0x010000 to make it work properly.
The error can be easily reproduced using the glyph '🜄', which stands for 'Alchemical Symbol For Water'.
This should be encoded as [240, 159, 156, 132] (hex 0xF0 0x9F 0x9C 0x84) but ends up as [240, 143, 144, 128].
Correcting the line in question to charcode = (((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff)) + 0x010000; fixes that error.
By the way, the function fromUTF8Array() does not suffer from this issue.
"क्षति" gives: (15) [224, 164, 149, 224, 165, 141, 224, 164, 183, 224, 164, 164, 224, 164, 191]
I can't imagine this being anything else than wrong.
"क्षति" gives: (15) [224, 164, 149, 224, 165, 141, 224, 164, 183, 224, 164, 164, 224, 164, 191]
I can't imagine this being anything else than wrong.
That is in fact the correct result: "क्षति" is a string of 5 Unicode code points, and that is how they are encoded in UTF-8.
toUTF8Array()
contains an error which results in incorrect utf-8 encoding for surrogate pairs branch.In the line containing
charcode = ((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff)
its necessary to add0x010000
to make it work properly.The error can be easily reproduced using glyph '🜄' which stands for 'Alchemical Symbol For Water'.
This should be rendered to:[240, 159, 156, 132]
or hex0xF0 0x9F 0x9C 0x84
but ends up to be[240, 143, 144, 128]
.Correcting the according line to
charcode = (((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff)) + 0x010000;
fixes that error.Btw. function
fromUTF8Array()
does not suffer from this issue.
The latest version (from Aug 6 2013) no longer has this bug. Did you perchance refer to a previous version of this gist?
toUTF8Array()
contains an error which results in incorrect utf-8 encoding for surrogate pairs branch.
In the line containingcharcode = ((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff)
its necessary to add0x010000
to make it work properly.
The error can be easily reproduced using glyph '🜄' which stands for 'Alchemical Symbol For Water'.
This should be rendered to:[240, 159, 156, 132]
or hex0xF0 0x9F 0x9C 0x84
but ends up to be[240, 143, 144, 128]
.
Correcting the according line tocharcode = (((charcode&0x3ff)<<10)|(str.charCodeAt(i)&0x3ff)) + 0x010000;
fixes that error.
Btw. functionfromUTF8Array()
does not suffer from this issue.The latest version (from Aug 6 2013) no longer has this bug. Did you perchance refer to a previous version of this gist?
The latest version on the top render toUTF8Array('🜄') as (4) [240, 143, 156, 132]
Where as mos0711 pointed out, it should be [240, 159, 156, 132] with his fix.
Hi, I would really like to use this useful snippet in my code, but I am wondering about legal matters such as copyright and licensing. I can't see any copyright notice in this snippet or on @joni's GitHub profile. In this case, according to GitHub policy, I can't use this code :(
https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/licensing-a-repository
Any thoughts on how I can deal with such legal problems? @joni, I am wondering whether you could add a commonly used open source license to your snippet. Or are there any other ideas?
Very useful! Thank you.
But how should I decode a UTF-8 array into a JavaScript (UTF-16) string?