Created
June 1, 2011 14:05
-
-
Save teramako/1002350 to your computer and use it in GitHub Desktop.
Unicode ⇔ UTF-8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Marker bits of the lead byte of a multi-byte UTF-8 sequence,
// indexed by (sequence length in bytes - 2).
const table = [
  0xC0, // 192 (11000000): lead byte of a 2-byte sequence
  0xE0, // 224 (11100000): lead byte of a 3-byte sequence
  0xF0  // 240 (11110000): lead byte of a 4-byte sequence
];
/**
 * Convert a JavaScript (UTF-16) string into its UTF-8 byte sequence.
 *
 * @param {String} str String to encode.
 * @return {Number[]} Octets (0-255) forming the UTF-8 encoding of str.
 * @throws {Error} If str contains an unpaired surrogate code unit.
 */
function toUTF8Octets (str) {
  var result = [];

  /**
   * Encode a single Unicode code point as UTF-8.
   * @param {Number} code Code point in the range 0 - 0x10FFFF.
   * @return {Number[]} One to four octets.
   */
  function charToBytes (code) {
    if (code < 0x80) // 0 - 0x7F: 1 byte
      return [code];
    if (code < 0x0800) // 0x80 - 0x07FF: 2 bytes
      return [
        0xC0 | (code >> 6),
        0x80 | (code & 0x3F)
      ];
    if (code < 0x010000) // 0x0800 - 0xFFFF: 3 bytes
      return [
        0xE0 | (code >> 12),
        0x80 | ((code >> 6) & 0x3F),
        0x80 | (code & 0x3F)
      ];
    if (code < 0x110000) // 0x010000 - 0x10FFFF: 4 bytes
      return [
        0xF0 | (code >> 18),
        0x80 | ((code >> 12) & 0x3F),
        0x80 | ((code >> 6) & 0x3F),
        0x80 | (code & 0x3F)
      ];
    throw new Error("Invalid");
  }

  for (var i = 0, len = str.length; i < len; ++i) {
    // String#charCodeAt(n) returns the n-th UTF-16 code unit.
    var code = str.charCodeAt(i);
    if (0xD800 <= code && code <= 0xDBFF) {
      // High (lead) surrogate: the whole range 0xD800 - 0xDBFF, not just
      // 0xD800 - 0xD8FF. It must be followed by a low (trail) surrogate.
      if (i + 1 >= len)
        throw new Error("Invalid: surrogate 1");
      var code2 = str.charCodeAt(i + 1);
      if (code2 < 0xDC00 || 0xDFFF < code2)
        throw new Error("Invalid: surrogate 2");
      // Join the low 10 bits of each surrogate and add 0x10000 to get the
      // code point.
      code = ((code & 0x03FF) << 10) + (code2 & 0x03FF) + 0x10000;
      ++i; // the low surrogate has been consumed
    } else if (0xDC00 <= code && code <= 0xDFFF) {
      // A low surrogate with no preceding high surrogate cannot be
      // represented in well-formed UTF-8.
      throw new Error("Invalid: unpaired low surrogate");
    }
    result.push.apply(result, charToBytes(code));
  }
  return result;
}
/**
 * Convert a UTF-8 byte sequence into a JavaScript (UTF-16) string.
 *
 * @param {Number[]} bytes Octets (0-255) holding UTF-8 encoded text.
 * @return {String} The decoded string.
 * @throws {Error} "Invalid" if the input has a stray continuation byte, an
 *   illegal lead byte, a truncated sequence, a malformed continuation byte,
 *   an overlong encoding, or a code point above 0x10FFFF.
 */
function fromUTF8Octets (bytes) {
  var res = [];
  var len = bytes.length;
  var i = 0;
  while (i < len) {
    var b = bytes[i];
    if (b < 0x80) {
      // Single-byte (ASCII) sequence.
      res.push(String.fromCharCode(b));
      ++i;
      continue;
    }

    var extra; // number of continuation bytes that must follow
    var code;  // payload bits accumulated so far
    var min;   // smallest code point that legitimately needs this length
    if (0xC0 <= b && b < 0xE0) {        // 110xxxxx
      extra = 1; code = b & 0x1F; min = 0x80;
    } else if (0xE0 <= b && b < 0xF0) { // 1110xxxx
      extra = 2; code = b & 0x0F; min = 0x0800;
    } else if (0xF0 <= b && b < 0xF8) { // 11110xxx
      extra = 3; code = b & 0x07; min = 0x010000;
    } else {
      // Stray continuation byte (0x80 - 0xBF), 0xF8 - 0xFF, or not a byte.
      throw new Error("Invalid");
    }

    // Every continuation byte must exist inside the input.
    if (i + extra >= len)
      throw new Error("Invalid");
    for (var k = 1; k <= extra; ++k) {
      var c = bytes[i + k];
      if ((c & 0xC0) !== 0x80) // continuation bytes look like 10xxxxxx
        throw new Error("Invalid");
      code = (code << 6) | (c & 0x3F);
    }

    // Reject overlong encodings and values beyond the Unicode range.
    if (code < min || code > 0x10FFFF)
      throw new Error("Invalid");

    if (code < 0x10000) {
      // NOTE: code points 0xD800 - 0xDFFF encoded as 3 bytes are still
      // accepted here, as they were before this check was tightened.
      res.push(String.fromCharCode(code));
    } else {
      // Split into a UTF-16 surrogate pair.
      code -= 0x10000;
      var w1 = 0xD800 | (code >> 10);
      var w2 = 0xDC00 | (code & 0x03FF);
      res.push(String.fromCharCode(w1, w2));
    }
    i += extra + 1;
  }
  return res.join("");
}
// vim: sw=2 ts=2 et: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment