Skip to content

Instantly share code, notes, and snippets.

@teramako
Created June 1, 2011 14:05
Show Gist options
  • Save teramako/1002350 to your computer and use it in GitHub Desktop.
Save teramako/1002350 to your computer and use it in GitHub Desktop.
Unicode ⇔ UTF-8
// Marker bits of the lead byte of a multi-byte UTF-8 sequence,
// indexed by (sequence length in bytes - 2).
const table = [
  0b11000000, // 0xC0 (192): lead byte of a 2-byte sequence
  0b11100000, // 0xE0 (224): lead byte of a 3-byte sequence
  0b11110000, // 0xF0 (240): lead byte of a 4-byte sequence
];
/**
 * Encode a JavaScript string as a sequence of UTF-8 octets.
 *
 * The string is read as UTF-16 code units; each surrogate pair is combined
 * into a single Unicode code point before being encoded.
 *
 * @param {String} str
 * @return {Number[]} UTF-8 octets, one array element per byte
 * @throws {Error} if str contains an unpaired (high or low) surrogate
 */
function toUTF8Octets (str) {
  // Marker bits of the lead byte, indexed by (sequence length - 2).
  const leadMarkers = [0xC0, 0xE0, 0xF0];
  const result = [];

  /**
   * Return the number of UTF-8 bytes needed to encode a code point.
   * @param {Number} code Unicode code point
   * @return {Number} 1 to 4
   * @throws {Error} if code is above U+10FFFF
   */
  function getLength (code) {
    if (code < 0x80) { // 0x0000 - 0x007F
      return 1;
    }
    if (code < 0x0800) { // 0x0080 - 0x07FF
      return 2;
    }
    if (code < 0x010000) { // 0x0800 - 0xFFFF
      return 3;
    }
    if (code < 0x110000) { // 0x10000 - 0x10FFFF
      return 4;
    }
    throw new Error("Invalid");
  }

  /**
   * Encode a single Unicode code point as UTF-8 bytes.
   * @param {Number} code Unicode code point
   * @return {Number[]}
   */
  function charToBytes (code) {
    if (code < 0x80) {
      return [code];
    }
    const count = getLength(code);
    const bytes = [];
    let rest = code;
    // Peel off 6 bits at a time from the low end; each becomes a
    // continuation byte (10xxxxxx), inserted at the front so the final
    // order is most-significant first.
    for (let n = 1; n < count; ++n) {
      bytes.unshift(0x80 | (rest & 0x3F));
      rest >>= 6;
    }
    // Whatever bits remain go into the lead byte beneath its length marker.
    bytes.unshift(leadMarkers[count - 2] | rest);
    return bytes;
  }

  for (let i = 0, len = str.length; i < len; ++i) {
    // String#charCodeAt(n) returns the n-th UTF-16 code unit.
    let code = str.charCodeAt(i);
    if (0xD800 <= code && code <= 0xDBFF) {
      // High surrogate: it must be followed by a low surrogate. Join the
      // low 10 bits of each and add 0x10000 to get the code point.
      if (i + 1 >= len) {
        throw new Error("Invalid: surrogate 1");
      }
      const low = str.charCodeAt(i + 1);
      if (low < 0xDC00 || low > 0xDFFF) {
        throw new Error("Invalid: surrogate 2");
      }
      code = ((code & 0x03FF) << 10) + (low & 0x03FF) + 0x10000;
      ++i; // the low surrogate has been consumed
    } else if (0xDC00 <= code && code <= 0xDFFF) {
      // A low surrogate that was not preceded by a high surrogate cannot be
      // represented in well-formed UTF-8.
      throw new Error("Invalid: unpaired low surrogate");
    }
    result.push.apply(result, charToBytes(code));
  }
  return result;
}
/**
 * Decode a sequence of UTF-8 octets into a JavaScript string.
 *
 * Code points above U+FFFF are returned as UTF-16 surrogate pairs.
 * Overlong encodings and 3-byte encodings of surrogate code points are not
 * rejected; they are decoded to whatever code point their payload bits hold.
 *
 * @param {Number[]} bytes UTF-8 octets
 * @return {String}
 * @throws {Error} on an invalid lead byte, a truncated sequence, a
 *   continuation byte outside 0x80-0xBF, or a code point above U+10FFFF
 */
function fromUTF8Octets (bytes) {
  const res = [];
  const len = bytes.length;
  let i = 0;
  while (i < len) {
    const lead = bytes[i];
    if (lead < 0x80) {
      // Single-byte (ASCII) sequence.
      res.push(String.fromCharCode(lead));
      ++i;
      continue;
    }

    // Classify the lead byte: how many continuation bytes follow, and which
    // of its low bits carry payload.
    let extra;
    let code;
    if (lead >= 0xC0 && lead < 0xE0) { // 110xxxxx
      extra = 1;
      code = lead & 0x1F;
    } else if (lead >= 0xE0 && lead < 0xF0) { // 1110xxxx
      extra = 2;
      code = lead & 0x0F;
    } else if (lead >= 0xF0 && lead < 0xF8) { // 11110xxx
      extra = 3;
      code = lead & 0x07;
    } else {
      // A stray continuation byte (0x80-0xBF), a byte of 0xF8 or above, or
      // a value that is not a byte at all.
      throw new Error("Invalid");
    }

    // Every continuation byte must actually be present in the input.
    if (i + extra >= len) {
      throw new Error("Invalid");
    }
    for (let k = 1; k <= extra; ++k) {
      const cont = bytes[i + k];
      // Continuation bytes have the form 10xxxxxx.
      if (!(cont >= 0x80 && cont <= 0xBF)) {
        throw new Error("Invalid");
      }
      code = (code << 6) | (cont & 0x3F);
    }

    // U+10FFFF is the last valid code point.
    if (code > 0x10FFFF) {
      throw new Error("Invalid");
    }
    // fromCodePoint emits a surrogate pair for code points above U+FFFF.
    res.push(String.fromCodePoint(code));
    i += extra + 1;
  }
  return res.join("");
}
// vim: sw=2 ts=2 et:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment