Created
December 9, 2011 11:45
-
-
Save gerardpaapu/1451221 to your computer and use it in GitHub Desktop.
UTF-8 encoding in javascript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Smoke tests for the helpers exported by ./utf8.js.
// Run with `node <this file>`; the first failing assertion throws.
var assert = require('assert'),
    utf8 = require('./utf8.js'),
    toBytes = utf8.toBytes,
    toUTF8Bytes = utf8.toUTF8Bytes,
    fromUTF8Bytes = utf8.fromUTF8Bytes;

// toBytes: big-endian byte split, optionally left-padded to a minimum length.
assert.deepEqual(toBytes(0x0), [0]);
assert.deepEqual(toBytes(0xff), [0xff]);
assert.deepEqual(toBytes(0x01ff), [0x01, 0xff]);
assert.deepEqual(toBytes(0xffff), [0xff, 0xff]);

// U+00A2 (cent sign): two-byte UTF-8 sequence.
assert.deepEqual(toBytes(0x00A2, 2), [0x00, 0xA2]);
assert.deepEqual(toUTF8Bytes(0x00A2), [0xC2, 0xA2]);

// U+20AC (euro sign): three-byte UTF-8 sequence.
assert.deepEqual(toBytes(0x20ac, 2), [0x20, 0xac]);
assert.deepEqual(toUTF8Bytes(0x20ac), [0xe2, 0x82, 0xac]);

// U+24B62: four-byte UTF-8 sequence.
assert.deepEqual(toBytes(0x024b62, 3), [0x02, 0x4b, 0x62]);
assert.deepEqual(toUTF8Bytes(0x024b62), [0xf0, 0xa4, 0xad, 0xa2]);

// Decoding.
assert.equal(fromUTF8Bytes([0x40]), '@');
assert.equal(fromUTF8Bytes([0xc2, 0xa2]), '\u00a2');
assert.equal(fromUTF8Bytes([0xe2, 0x82, 0xac]), '\u20ac');
// U+24B62 is outside the Basic Multilingual Plane, so a JavaScript string
// holds it as the surrogate pair D852 DF62. String.fromCharCode(0x024b62)
// must not be used as the expectation: it keeps only the low 16 bits and
// yields U+4B62, a different character.
assert.equal(fromUTF8Bytes([0xf0, 0xa4, 0xad, 0xa2]), '\ud852\udf62');
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*jshint bitwise: false, onevar: false */ | |
var Bytes; | |
(function () { | |
var toBytes, toUTF8Bytes, fromUTF8Bytes, readFromBytes, assert; | |
// Minimal internal assertion helper: does nothing when `test` is truthy,
// otherwise throws an Error carrying `message`.
assert = function (test, message) {
    if (test) {
        return;
    }
    throw new Error(message);
};
// Wraps an array of byte values. The array is stored by reference in
// `this.data`; it is not copied.
// Calling `Bytes(data)` without `new` returns a fresh instance instead of
// writing `data` onto the global object (or throwing in strict mode).
Bytes = function Bytes(data) {
    if (!(this instanceof Bytes)) {
        return new Bytes(data);
    }
    this.data = data;
};
// Builds a Bytes instance from `str` using the named encoding.
//   format   - 'ascii', or 'utf8' / 'utf-8' (both spellings are accepted so
//              the name used here also works with Bytes.prototype.toString).
//   method   - forwarded to Bytes.fromStringAscii (ASCII only).
//   replacer - forwarded to Bytes.fromStringAscii (ASCII only).
// Throws TypeError for any other format.
Bytes.fromString = function (str, format, method, replacer) {
    switch (format) {
    case 'ascii':
        return Bytes.fromStringAscii(str, method, replacer);
    case 'utf8':
    case 'utf-8':
        return Bytes.fromStringUTF8(str);
    default:
        throw new TypeError('bad format ' + format);
    }
};
// Decodes the wrapped bytes into a string using the named encoding.
//   format - 'ascii', or 'utf-8' / 'utf8' (both spellings are accepted so
//            the name used with Bytes.fromString also works here).
// Throws TypeError for any other format, including a missing one.
Bytes.prototype.toString = function (format) {
    switch (format) {
    case 'ascii':
        return this.asciiBytesToString();
    case 'utf-8':
    case 'utf8':
        return this.utf8BytesToString();
    default:
        throw new TypeError('Bad format ' + format);
    }
};
// Encodes `str` as 7-bit ASCII and returns the result as a new Bytes.
//   method   - what to do with a character above 127:
//                'elide'   drop it,
//                'replace' emit `replacer` (or its return value),
//                'entity'  emit the ASCII bytes of the numeric entity "&#N;",
//                otherwise throw an Error.
//   replacer - a value, or a function called with the offending code point.
//              A string result is expanded into one entry per UTF-16 unit;
//              any other result is appended as-is.
// A valid surrogate pair is treated as ONE character (its code point), so it
// is elided/replaced once and produces a single, correct numeric entity.
Bytes.fromStringAscii = function (str, method, replacer) {
    var data = [],
        max = str.length,
        substitute, append, i, code, low;

    // The wrapper closes over the untouched `replacer` parameter, so a
    // non-function replacer is handed back verbatim.
    substitute = typeof replacer === 'function' ? replacer
        : function () { return replacer; };

    // Appends a replacement to the output, expanding strings unit by unit.
    append = function (value) {
        var k;
        if (typeof value === 'string') {
            for (k = 0; k < value.length; k++) {
                data.push(value.charCodeAt(k));
            }
        } else {
            data.push(value);
        }
    };

    for (i = 0; i < max; i++) {
        code = str.charCodeAt(i);

        if (code < 128) {
            data.push(code);
            continue;
        }

        // Fold a high/low surrogate pair into the code point it represents.
        if (code >= 0xD800 && code <= 0xDBFF && i + 1 < max) {
            low = str.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                i++;
            }
        }

        switch (method) {
        case 'elide':
            break;
        case 'replace':
            append(substitute(code));
            break;
        case 'entity':
            append('&#' + code + ';');
            break;
        default:
            throw new Error("Character out of range: " + code);
        }
    }

    return new Bytes(data);
};
// Decodes the wrapped bytes as ASCII: each entry of `this.data` becomes the
// character with that char code.
// Defined on the prototype because Bytes.prototype.toString invokes it as
// `this.asciiBytesToString()`. The string is built entry by entry rather
// than with String.fromCharCode.apply, so very large arrays cannot exceed
// the engine's argument-count limit.
Bytes.prototype.asciiBytesToString = function () {
    var data = this.data || [],
        chars = [],
        i;
    for (i = 0; i < data.length; i++) {
        chars.push(String.fromCharCode(data[i]));
    }
    return chars.join('');
};
// Kept so existing references to the former static property still resolve.
Bytes.asciiBytesToString = Bytes.prototype.asciiBytesToString;
// Encodes `source` as UTF-8 and returns the result as a new Bytes.
// The string is walked by code point: a high surrogate followed by a low
// surrogate is combined into one supplementary code point before encoding.
// An unpaired surrogate is passed to toUTF8Bytes unchanged.
Bytes.fromStringUTF8 = function (source) {
    var data = [],
        max = source.length,
        i, ch, low;

    for (i = 0; i < max; i++) {
        ch = source.charCodeAt(i);

        if (ch >= 0xD800 && ch <= 0xDBFF && i + 1 < max) {
            low = source.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                ch = 0x10000 + ((ch - 0xD800) << 10) + (low - 0xDC00);
                i++; // the low surrogate has been consumed
            }
        }

        // Encode the character itself, not its index in the string.
        data.push.apply(data, toUTF8Bytes(ch));
    }

    return new Bytes(data);
};
// Decodes the wrapped bytes as UTF-8 by handing `this.data` to fromUTF8Bytes
// and returning whatever string it produces.
Bytes.prototype.utf8BytesToString = function () {
    var bytes = this.data;
    return fromUTF8Bytes(bytes);
};
// Splits a non-negative integer into big-endian bytes.
//   n - the value to split (coerced with unary plus).
//   l - optional minimum length; the result is left-padded with zeros.
// Returns an array of integers in 0..255, most significant byte first
// (always at least one entry, so 0 yields [0]).
// Throws TypeError when `n` is negative, fractional, NaN or infinite.
// Division is used instead of `>>>` so values of 2^32 and above are not
// truncated to 32 bits.
toBytes = function (n, l) {
    var value = +n,
        result = [];

    if (!isFinite(value) || value < 0 || Math.floor(value) !== value) {
        throw new TypeError('toBytes expects a non-negative integer: ' + n);
    }

    do {
        result.unshift(value % 256);
        value = Math.floor(value / 256);
    } while (value > 0);

    while (result.length < l) {
        result.unshift(0);
    }

    return result;
};
// Encodes one Unicode code point as UTF-8.
//   codePoint - an integer in 0..0x10FFFF.
// Returns an array of 1 to 4 byte values.
// Throws TypeError ('Invalid Codepoint: ...') for anything else, including
// negative, fractional and NaN input.
// Surrogate code points (0xD800..0xDFFF) are not rejected; they are encoded
// as ordinary three-byte sequences.
toUTF8Bytes = function (codePoint) {
    if (typeof codePoint !== 'number' ||
            codePoint < 0 ||
            codePoint > 0x10FFFF ||
            Math.floor(codePoint) !== codePoint) { // also true for NaN
        throw new TypeError('Invalid Codepoint: ' +
            (typeof codePoint === 'number' ? codePoint.toString(16) : String(codePoint)));
    }

    // 0xxxxxxx
    if (codePoint <= 0x007F) {
        return [ codePoint ];
    }

    // 110xxxxx 10xxxxxx
    if (codePoint <= 0x07FF) {
        return [
            0xc0 | (codePoint >>> 6),
            0x80 | (codePoint & 0x3f)
        ];
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (codePoint <= 0xFFFF) {
        return [
            0xe0 | (codePoint >>> 12),
            0x80 | ((codePoint >>> 6) & 0x3f),
            0x80 | (codePoint & 0x3f)
        ];
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return [
        0xf0 | (codePoint >>> 18),
        0x80 | ((codePoint >>> 12) & 0x3f),
        0x80 | ((codePoint >>> 6) & 0x3f),
        0x80 | (codePoint & 0x3f)
    ];
};
// Decodes an array of UTF-8 bytes into a string.
// readFromBytes is called repeatedly; each call reports one code point and
// the number of bytes it consumed. Errors it throws propagate to the caller.
// Code points above U+FFFF are stored as a surrogate pair, because
// String.fromCharCode keeps only the low 16 bits of each argument.
// The string is assembled in fixed-size chunks so long inputs cannot exceed
// the engine's limit on arguments passed through `apply`.
fromUTF8Bytes = function (bytes) {
    var max = bytes.length,
        i = 0,
        units = [],
        parts = [],
        CHUNK = 0x2000,
        add, start;

    // Collects one decoded code point as one or two UTF-16 code units.
    add = function (cp) {
        if (cp > 0xFFFF) {
            cp -= 0x10000;
            units.push(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF));
        } else {
            units.push(cp);
        }
    };

    while (i < max) {
        i += readFromBytes(bytes, i, add);
    }

    for (start = 0; start < units.length; start += CHUNK) {
        parts.push(String.fromCharCode.apply(null, units.slice(start, start + CHUNK)));
    }

    return parts.join('');
};
// Reads a single UTF-8 encoded character starting at bytes[i], calls `emit`
// with its code point, and returns the number of bytes consumed (1 to 4).
//
// Throws:
//   Error     - lead byte 0xC0, 0xC1 or above 0xF4 (never valid in UTF-8);
//               the input ends in the middle of a sequence;
//               a following byte is not a continuation byte (10xxxxxx);
//               the sequence is overlong (longer than the value needs);
//               the value is above U+10FFFF.
//   TypeError - bytes[i] is not a lead byte at all (a stray continuation
//               byte, or `i` is past the end of the input).
// Surrogate code points (U+D800..U+DFFF) are not rejected.
readFromBytes = function (bytes, i, emit) {
    var lead = bytes[i],
        length, min, cp, k, next;

    if (lead === 0xc0 || lead === 0xc1 || lead > 0xf4) {
        throw new Error("Illegal byte: " + lead);
    }

    // 0xxxxxxx: a single ASCII byte.
    if (lead <= 0x7f) {
        emit(lead);
        return 1;
    }

    // The lead byte fixes the sequence length, the payload bits it carries,
    // and the smallest code point that legitimately needs that length.
    if ((lead & 0xe0) === 0xc0) {
        length = 2;
        cp = lead & 0x1f;
        min = 0x80;
    } else if ((lead & 0xf0) === 0xe0) {
        length = 3;
        cp = lead & 0x0f;
        min = 0x800;
    } else if ((lead & 0xf8) === 0xf0) {
        length = 4;
        cp = lead & 0x07;
        min = 0x10000;
    } else {
        throw new TypeError("Illegal Byte: " + lead);
    }

    assert(i + length <= bytes.length, 'unexpected end of input');

    // Each continuation byte contributes its low six bits.
    for (k = 1; k < length; k++) {
        next = bytes[i + k];
        assert((next & 0xc0) === 0x80, 'invalid continuation byte: ' + next);
        cp = (cp << 6) | (next & 0x3f);
    }

    assert(cp >= min, 'overlong encoding');
    assert(cp <= 0x10ffff, 'code point out of range');

    emit(cp);
    return length;
};
if (typeof exports != 'undefined') { | |
exports.toBytes = toBytes; | |
exports.toUTF8Bytes = toUTF8Bytes; | |
exports.fromUTF8Bytes = fromUTF8Bytes; | |
} | |
}()); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment