Skip to content

Instantly share code, notes, and snippets.

@gerardpaapu
Created December 9, 2011 11:45
Show Gist options
  • Save gerardpaapu/1451221 to your computer and use it in GitHub Desktop.
Save gerardpaapu/1451221 to your computer and use it in GitHub Desktop.
UTF-8 encoding in javascript
// Smoke tests for ./utf8.js: big-endian byte splitting, UTF-8 encoding of a
// single code point, and UTF-8 decoding of a byte array back into a string.
var assert = require('assert'),
    utf8 = require('./utf8.js'),
    toBytes = utf8.toBytes,
    toUTF8Bytes = utf8.toUTF8Bytes,
    fromUTF8Bytes = utf8.fromUTF8Bytes;

// toBytes(n): as few bytes as needed, most significant byte first.
assert.deepEqual(toBytes(0x0), [0]);
assert.deepEqual(toBytes(0xff), [0xff]);
assert.deepEqual(toBytes(0x01ff), [0x01, 0xff]);
assert.deepEqual(toBytes(0xffff), [0xff, 0xff]);

// toBytes(n, l): left-padded with zero bytes up to l bytes.
assert.deepEqual(toBytes(0x00A2, 2), [0x00, 0xA2]);
assert.deepEqual(toBytes(0x20ac, 2), [0x20, 0xac]);
assert.deepEqual(toBytes(0x024b62, 3), [0x02, 0x4b, 0x62]);

// toUTF8Bytes: one sample each for the 2-, 3- and 4-byte encodings.
assert.deepEqual(toUTF8Bytes(0x00A2), [0xC2, 0xA2]);
assert.deepEqual(toUTF8Bytes(0x20ac), [0xe2, 0x82, 0xac]);
assert.deepEqual(toUTF8Bytes(0x024b62), [0xf0, 0xa4, 0xad, 0xa2]);

// fromUTF8Bytes: the same samples decoded, plus a plain ASCII byte.
assert.equal(fromUTF8Bytes([0x40]), '@');
assert.equal(fromUTF8Bytes([0xc2, 0xa2]), '\u00a2');
assert.equal(fromUTF8Bytes([0xe2, 0x82, 0xac]), '\u20ac');
// NOTE(review): String.fromCharCode keeps only the low 16 bits of each
// argument, so the expected value below is U+4B62 rather than U+24B62.
// Confirm whether a surrogate pair / String.fromCodePoint was intended.
assert.equal(fromUTF8Bytes([0xf0, 0xa4, 0xad, 0xa2]), String.fromCharCode(0x024b62));
/*jshint bitwise: false, onevar: false */
var Bytes;
(function () {
var toBytes, toUTF8Bytes, fromUTF8Bytes, readFromBytes, assert;
// Internal guard: does nothing when `test` is truthy, otherwise throws an
// Error whose message is `message`.
assert = function (test, message) {
    if (test) {
        return;
    }
    throw new Error(message);
};
// Constructor for a byte container. The value passed in is stored, without
// copying or validation, on the instance's `data` property.
Bytes = function Bytes(data) {
    this.data = data;
};
// Builds a Bytes instance from a string using the named encoding.
//   str      - the string to encode
//   format   - 'ascii', or 'utf8' / 'utf-8'. Both UTF-8 spellings are
//              accepted so that the name used with Bytes.prototype.toString
//              ('utf-8') also works here.
//   method, replacer - forwarded unchanged to Bytes.fromStringAscii; not
//              used for UTF-8.
// Returns whatever the selected encoder returns.
// Throws a TypeError for any other format.
Bytes.fromString = function (str, format, method, replacer) {
    switch (format) {
    case 'ascii':
        return Bytes.fromStringAscii(str, method, replacer);
    case 'utf8':
    case 'utf-8':
        return Bytes.fromStringUTF8(str);
    default:
        throw new TypeError('bad format ' + format);
    }
};
// Decodes this instance's bytes into a string using the named encoding.
//   format - 'ascii', or 'utf-8' / 'utf8'. Both UTF-8 spellings are accepted
//            so that the name used with Bytes.fromString ('utf8') also works
//            here.
// Throws a TypeError for any other format, including a missing one (so an
// implicit string conversion of a Bytes instance throws as well).
Bytes.prototype.toString = function (format) {
    switch (format) {
    case 'ascii':
        return this.asciiBytesToString();
    case 'utf8':
    case 'utf-8':
        return this.utf8BytesToString();
    default:
        throw new TypeError('Bad format ' + format);
    }
};
// Encodes a string as 7-bit ASCII and returns the result as a new Bytes.
//   str      - the string to encode
//   method   - what to do with a character outside the ASCII range:
//                'elide'   drop it
//                'replace' substitute `replacer` (see below)
//                'entity'  emit the ASCII bytes of the numeric character
//                          reference, e.g. "&#162;"
//              any other value (including undefined) throws an Error
//   replacer - used by 'replace'. Either a constant, or a function that is
//              called with the offending code point and returns the
//              substitute. A numeric substitute is appended as one byte; a
//              string substitute is appended as its character codes.
// A UTF-16 surrogate pair is treated as one character, so 'entity' and a
// replacer function see the full code point rather than two halves.
Bytes.fromStringAscii = function (str, method, replacer) {
    var data = [],
        max = str.length,
        i, code, low, substitute;

    // Appends every UTF-16 code unit of `text` to the output.
    var pushText = function (text) {
        var k;
        for (k = 0; k < text.length; k++) {
            data.push(text.charCodeAt(k));
        }
    };

    // Walk forward and append, so skipped characters never leave holes in
    // the output array.
    for (i = 0; i < max; i++) {
        code = str.charCodeAt(i);

        // Combine a high surrogate with the low surrogate that follows it.
        if (code >= 0xD800 && code <= 0xDBFF && i + 1 < max) {
            low = str.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                i++;
            }
        }

        if (code < 128) {
            data.push(code);
            continue;
        }

        switch (method) {
        case 'elide':
            break;
        case 'replace':
            // Read `replacer` directly instead of wrapping a constant in a
            // closure, so a constant can never end up returning the wrapper.
            substitute = typeof replacer == 'function' ? replacer(code) : replacer;
            if (typeof substitute == 'string') {
                pushText(substitute);
            } else {
                data.push(substitute);
            }
            break;
        case 'entity':
            pushText('&#' + code + ';');
            break;
        default:
            throw new Error("Character out of range: " + code);
        }
    }

    return new Bytes(data);
};
// Decodes this instance's bytes as ASCII: each entry of `this.data` becomes
// the character with that code.
// Defined on the prototype because it reads `this.data` and
// Bytes.prototype.toString invokes it on an instance.
Bytes.prototype.asciiBytesToString = function () {
    var data = this.data,
        CHUNK = 0x8000,
        pieces = [],
        start;

    // Function.prototype.apply has an engine-dependent limit on the number of
    // arguments, so convert a bounded slice at a time and join the pieces.
    for (start = 0; start < data.length; start += CHUNK) {
        pieces.push(String.fromCharCode.apply(
            null,
            Array.prototype.slice.call(data, start, start + CHUNK)
        ));
    }
    return pieces.join('');
};
// Encodes a string as UTF-8 and returns the result as a new Bytes.
// Each character's code point is handed to toUTF8Bytes and the bytes it
// returns are appended in order. A UTF-16 surrogate pair is first combined
// into its single code point; an unpaired surrogate is passed to toUTF8Bytes
// unchanged.
Bytes.fromStringUTF8 = function (source) {
    var data = [], max = source.length, i, ch, low;
    for (i = 0; i < max; i++) {
        ch = source.charCodeAt(i);

        // High surrogate followed by a low surrogate: one astral code point.
        if (ch >= 0xD800 && ch <= 0xDBFF && i + 1 < max) {
            low = source.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                ch = 0x10000 + ((ch - 0xD800) << 10) + (low - 0xDC00);
                i++;
            }
        }

        // Encode the character itself (not its index in the string).
        data.push.apply(data, toUTF8Bytes(ch));
    }
    return new Bytes(data);
};
// Decodes this instance's bytes as UTF-8 by handing `this.data` to
// fromUTF8Bytes and returning its result.
Bytes.prototype.utf8BytesToString = function () {
    var encoded = this.data;
    return fromUTF8Bytes(encoded);
};
// Splits a non-negative integer into its bytes, most significant first.
//   n - the integer to split (exact for values up to 2^53)
//   l - optional minimum length; the result is left-padded with zero bytes
//       until it has at least `l` entries
// Returns an array of numbers in the range 0..255; zero yields [0].
// Throws a RangeError when `n` is not a finite, non-negative integer.
toBytes = function (n, l) {
    var result = [],
        rest = n;

    if (typeof n !== 'number' || !isFinite(n) || n < 0 || Math.floor(n) !== n) {
        throw new RangeError('toBytes expects a non-negative integer, got ' + n);
    }

    // Divide instead of shifting: `>>>` truncates to 32 bits and would
    // silently drop the high bytes of larger values.
    do {
        result.push(rest % 256);
        rest = Math.floor(rest / 256);
    } while (rest > 0);

    while (result.length < l) {
        result.push(0);
    }

    // Bytes were collected least significant first.
    return result.reverse();
};
// Encodes one code point as UTF-8.
//   char - an integer code point in the range 0..0x10FFFF
// Returns an array of one to four byte values.
// Throws a TypeError ("Invalid Codepoint: ...") when `char` is not an
// integer in that range; negative and fractional values are rejected rather
// than returned verbatim as a "byte".
// Surrogate code points (0xD800..0xDFFF) are not rejected; they are encoded
// with the three-byte form like any other value in that range.
toUTF8Bytes = function (char) {
    if (typeof char !== 'number' || Math.floor(char) !== char ||
            char < 0 || char > 0x10FFFF) {
        throw new TypeError('Invalid Codepoint: ' +
            (typeof char === 'number' ? char.toString(16) : String(char)));
    }

    // 0xxxxxxx
    if (char <= 0x007F) {
        return [ char ];
    }

    // 110xxxxx 10xxxxxx
    if (char <= 0x07FF) {
        return [
            0xc0 | (char >> 6),
            0x80 | (char & 0x3f)
        ];
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (char <= 0xFFFF) {
        return [
            0xe0 | (char >> 12),
            0x80 | ((char >> 6) & 0x3f),
            0x80 | (char & 0x3f)
        ];
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return [
        0xf0 | (char >> 18),
        0x80 | ((char >> 12) & 0x3f),
        0x80 | ((char >> 6) & 0x3f),
        0x80 | (char & 0x3f)
    ];
};
// Decodes an array of UTF-8 bytes into a string.
//   bytes - array of byte values
// Code points are read one at a time with readFromBytes; any error it throws
// for malformed input propagates to the caller. Empty input yields ''.
fromUTF8Bytes = function (bytes) {
    var max = bytes.length,
        i = 0,
        units = [],
        pieces = [],
        CHUNK = 0x8000,
        add;

    // Collects UTF-16 code units. String.fromCharCode keeps only the low 16
    // bits of each argument, so a code point above 0xFFFF has to be written
    // as a surrogate pair instead of being passed through directly.
    add = function (cp) {
        var offset;
        if (cp > 0xFFFF) {
            offset = cp - 0x10000;
            units.push(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3ff));
        } else {
            units.push(cp);
        }
    };

    while (i < max) {
        i += readFromBytes(bytes, i, add);
    }

    // Function.prototype.apply has an engine-dependent limit on the number of
    // arguments, so convert a bounded slice at a time and join the pieces.
    for (i = 0; i < units.length; i += CHUNK) {
        pieces.push(String.fromCharCode.apply(null, units.slice(i, i + CHUNK)));
    }
    return pieces.join('');
};
// Reads one UTF-8 encoded code point from `bytes`, starting at index `i`,
// and passes it to `emit`.
//   bytes - array of byte values
//   i     - index of the lead byte of the sequence to read
//   emit  - function called once with the decoded code point
// Returns the number of bytes consumed (1 to 4).
// Throws when the lead byte can never start a sequence (0xC0, 0xC1, anything
// above 0xF4, or a continuation byte), when the input ends in the middle of
// a sequence, when a trailing byte is not of the form 10xxxxxx, when the
// sequence is an overlong encoding, or when it decodes to a value above
// 0x10FFFF. Surrogate code points (0xD800..0xDFFF) are not rejected.
readFromBytes = function (bytes, i, emit) {
    var byte = bytes[i],
        length, min, cp, k, next;

    if (byte == 0xc0 || byte == 0xc1 || byte > 0xf4) {
        throw new Error("Illegal byte: " + byte);
    }

    // Single-byte (ASCII) sequence.
    if (byte <= 0x7f) {
        emit(byte);
        return 1;
    }

    // The lead byte fixes the sequence length, the payload bits it carries,
    // and the smallest code point that legitimately needs that length.
    if ((byte & 0xe0) === 0xc0) {
        length = 2;
        cp = byte & 0x1f;
        min = 0x80;
    } else if ((byte & 0xf0) === 0xe0) {
        length = 3;
        cp = byte & 0x0f;
        min = 0x800;
    } else if ((byte & 0xf8) === 0xf0) {
        length = 4;
        cp = byte & 0x07;
        min = 0x10000;
    } else {
        throw new TypeError("Illegal Byte: " + byte);
    }

    assert(i + length <= bytes.length, 'unexpected end of input');

    // Each trailing byte must look like 10xxxxxx and contributes six bits.
    for (k = 1; k < length; k++) {
        next = bytes[i + k];
        assert((next & 0xc0) === 0x80, 'invalid continuation byte: ' + next);
        cp = (cp << 6) | (next & 0x3f);
    }

    // A value that would have fit in a shorter sequence is an overlong form.
    assert(cp >= min, 'overlong encoding');
    assert(cp <= 0x10FFFF, 'code point out of range');

    emit(cp);
    return length;
};
if (typeof exports != 'undefined') {
exports.toBytes = toBytes;
exports.toUTF8Bytes = toUTF8Bytes;
exports.fromUTF8Bytes = fromUTF8Bytes;
}
}());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment